[clang] 9a1a7d8 - [SVE] Add more warnings checks to clang and LLVM SVE tests
Author: David Sherwood
Date: 2020-07-07T09:33:20+01:00
New Revision: 9a1a7d888b53ebe5a934a8193de37da86e276f1e

URL: https://github.com/llvm/llvm-project/commit/9a1a7d888b53ebe5a934a8193de37da86e276f1e
DIFF: https://github.com/llvm/llvm-project/commit/9a1a7d888b53ebe5a934a8193de37da86e276f1e.diff

LOG: [SVE] Add more warnings checks to clang and LLVM SVE tests

There are now more SVE tests in LLVM and Clang that do not emit warnings
related to invalid use of EVT::getVectorNumElements() and
VectorType::getNumElements(). For these tests I have added additional checks
that there are no warnings in order to prevent any future regressions.

Differential Revision: https://reviews.llvm.org/D82943

Added:

Modified:
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acge.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acgt.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acle.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_aclt.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpeq.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpge.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpgt.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmple.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmplt.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpne.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpuo.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_index.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sb.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sh.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sw.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1ub.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uh.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uw.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sb.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sh.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sw.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1ub.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uh.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uw.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pnext.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptrue.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_setffr.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpkhi.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpklo.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilele.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilelt.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2.c
    llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll
    llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
    llvm/test/CodeGen/AArch64/sve-fcmp.ll
    llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
    llvm/test/CodeGen/AArch64/sve-gep.ll
    llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-scaled-offsets.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-unscaled-offsets.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-scaled-offset.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-unscaled-offset.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-imm-offset.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-scalar-offset.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-vector-base-imm-offset.ll
    llvm/test/CodeGen/AArch64/sve
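For reference, the guard added to each of these tests compiles to assembly with diagnostics captured in a temporary file and then asserts that the file contains no warning text. A representative excerpt (the same lines appear verbatim in the diffs quoted further below; the exact flags vary per test):

```
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
// ASM-NOT: warning
```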
[clang] ae47d15 - Remove "rm -f" workaround in acle_sve_adda.c
Author: David Sherwood
Date: 2020-06-26T08:16:40+01:00
New Revision: ae47d158a096abad43d8f9056518d83b66c5a4b7

URL: https://github.com/llvm/llvm-project/commit/ae47d158a096abad43d8f9056518d83b66c5a4b7
DIFF: https://github.com/llvm/llvm-project/commit/ae47d158a096abad43d8f9056518d83b66c5a4b7.diff

LOG: Remove "rm -f" workaround in acle_sve_adda.c

Added:

Modified:
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c

Removed:

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
index 9d9c33a891cd..853da8783faa 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
@@ -1,5 +1,4 @@
 // REQUIRES: aarch64-registered-target
-// RUN: rm -f -- %S/acle_sve_adda.s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
[clang] c02332a - [CodeGen] Fix warning in getNode for EXTRACT_SUBVECTOR
Author: David Sherwood
Date: 2020-06-30T08:11:41+01:00
New Revision: c02332a69399a82244298f0097bc98fafdeb3042

URL: https://github.com/llvm/llvm-project/commit/c02332a69399a82244298f0097bc98fafdeb3042
DIFF: https://github.com/llvm/llvm-project/commit/c02332a69399a82244298f0097bc98fafdeb3042.diff

LOG: [CodeGen] Fix warning in getNode for EXTRACT_SUBVECTOR

Fix a warning in getNode() when extracting a subvector from a concat
vector. We can simply replace the call to getVectorNumElements with
getVectorMinNumElements as this follows the defined behaviour for
EXTRACT_SUBVECTOR.

Differential Revision: https://reviews.llvm.org/D82746

Added:

Modified:
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Removed:

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
index 788bad9022b5..7beb191cab30 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
@@ -1,6 +1,11 @@
+// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
 
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
index 502f22d84210..63e17c3e1e0f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
@@ -1,6 +1,11 @@
+// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
 
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
index 399fa187e83a..a34f41ff3b40 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
@@ -1,6 +1,11 @@
+// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
 
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
index 7170756d7a98..de21c59bb3b7 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
@@ -1,6 +1,11 @@
+// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o
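For context, the class of warning being fixed here (and guarded against by the ASM-NOT checks above) comes from querying a fixed element count on a scalable vector type. A minimal illustrative sketch, assuming LLVM's CodeGen headers are available; this program is not part of the commit:

```
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  EVT Fixed = EVT::getVectorVT(Ctx, MVT::i32, 4);                       // <4 x i32>
  EVT Scalable = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/true); // <vscale x 4 x i32>

  outs() << Fixed.getVectorNumElements() << "\n";       // 4, well defined
  outs() << Scalable.getVectorMinNumElements() << "\n"; // 4, the known minimum
  // Calling Scalable.getVectorNumElements() here would trigger the
  // "invalid use" warning that these test changes guard against.
  return 0;
}
```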
[clang] [Clang] Emit TBAA info for enums in C (PR #73326)
david-arm wrote:

Hi @AaronBallman, yes the problem I found with always choosing `char` as the alias type is that LLVM will just assume that enum types alias with absolutely everything. This is a conservative approach that works fine, but it does prevent important type-based alias optimisations from happening. GCC seems to take advantage of the fact that enums without any explicit type should be compatible with an `int` (or a suitable integer that the compiler is free to choose that can contain all enumerated values). Consequently it can be more aggressive in optimisations, which is why I started looking at the C specification to understand the rules.

If my understanding is correct, I think the important thing is that the type-based alias information should match the actual underlying type we've chosen for the enum. It could be, for example, that in the case of an enum like this:

```
enum Foo {
  A = 1,
  B = 0x1ull
};
```

gcc chooses a `long long` and clang chooses a `long`, since there seems to be wiggle room in the specification. I think that's fine provided the compiler is self-consistent. In either case both `long` and `long long` would not alias with `int` or `short`, which is still an improvement on treating it as a `char` in TBAA info.

https://github.com/llvm/llvm-project/pull/73326
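To make the optimisation at stake concrete, here is a hedged C illustration (not from the pull request; the names are hypothetical): once a plain enum carries an int-based TBAA tag instead of the omnipotent-char tag, strict aliasing lets clang assume the two stores below do not interfere:

```
enum Color { RED = 1, GREEN = 2 };

int f(enum Color *c, short *s) {
  *c = RED; /* tagged as "int" with this patch, "omnipotent char" before */
  *s = 7;   /* a "short" store cannot alias an "int" object under TBAA */
  return *c; /* the load may now be folded to RED */
}
```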
[llvm] [clang] [clang-tools-extra] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)
david-arm wrote:

Gentle ping!

https://github.com/llvm/llvm-project/pull/73515
[llvm] [clang] [Clang] Emit TBAA info for enums in C (PR #73326)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/73326

>From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Fri, 24 Nov 2023 13:20:23 +
Subject: [PATCH 1/3] [Clang] Emit TBAA info for enums in C

When emitting TBAA information for enums in C code we currently just
treat the data as an 'omnipotent char'. However, with C strict aliasing
this means we fail to optimise certain cases. For example, in the
SPEC2017 xz benchmark there are structs that contain arrays of enums,
and clang pessimistically assumes that accesses to those enums could
alias with other struct members that have a different type.

According to https://en.cppreference.com/w/c/language/enum enums should
be treated as 'int' types unless explicitly specified (C23) or if 'int'
would not be large enough to hold all the enumerated values. In the
latter case the compiler is free to choose a suitable integer that
would hold all such values. When compiling C code this patch generates
TBAA information for the enum by using an equivalent integer of the
size clang has already chosen for the enum. I have ignored C++ for now
because the rules are more complex.

New test added here: clang/test/CodeGen/tbaa.c
---
 clang/lib/CodeGen/CodeGenTBAA.cpp |   5 +-
 clang/test/CodeGen/tbaa.c         | 116 ++
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/tbaa.c

diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 8705d3d65f1a5..f59d3d422d520 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
+    if (!Features.CPlusPlus)
+      return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));
+
     // In C++ mode, types have linkage, so we can rely on the ODR and
     // on their mangled names, if they're external.
     // TODO: Is there a way to get a program-wide unique name for a
     // decl with local linkage or no linkage?
-    if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible())
+    if (!ETy->getDecl()->isExternallyVisible())
       return getChar();
 
     SmallString<256> OutName;
diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c
new file mode 100644
index 0..0ab81f60a7194
--- /dev/null
+++ b/clang/test/CodeGen/tbaa.c
@@ -0,0 +1,116 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// Test TBAA metadata generated by front-end.
+//
+// NO-TBAA-NOT: !tbaa
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef enum {
+  RED_AUTO_32,
+  GREEN_AUTO_32,
+  BLUE_AUTO_32
+} EnumAuto32;
+
+typedef enum {
+  RED_AUTO_64,
+  GREEN_AUTO_64,
+  BLUE_AUTO_64 = 0x1ull
+} EnumAuto64;
+
+typedef enum : uint16_t {
+  RED_16,
+  GREEN_16,
+  BLUE_16
+} Enum16;
+
+typedef enum : uint8_t {
+  RED_8,
+  GREEN_8,
+  BLUE_8
+} Enum8;
+
+uint32_t g0(EnumAuto32 *E, uint32_t *val) {
+// CHECK-LABEL: define{{.*}} i32 @g0(
+// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH-LABEL: define{{.*}} i32 @g0(
+// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+  *val = 5;
+  *E = RED_AUTO_32;
+  return *val;
+}
+
+uint64_t g1(EnumAuto64 *E, uint64_t *val) {
+// CHECK-LABEL: define{{.*}} i64 @g1(
+// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+// PATH-LABEL: define{{.*}} i64 @g1(
+// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+  *val = 5;
+  *E = RED_AUTO_64;
+  return *val;
+}
+
+uint16_t g2(Enum16 *E, uint16_t *val) {
+// CHECK-LABEL: define{{.*}} i16 @g2(
+// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.
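The rule the patch encodes can be summarised with a short sketch (hypothetical enums written as C, not from the test; the exact wider integer type chosen is implementation-defined, as the review discussion above notes):

```
enum Small { S0 = 1 };              /* fits in int: TBAA tag is "int" */
enum Big   { B0 = 0x100000000ull }; /* needs 64 bits: tagged as a 64-bit integer */
enum Fixed : unsigned short { F0 }; /* explicit underlying type (C23): 16-bit tag */
```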
[llvm] [clang] [Clang] Emit TBAA info for enums in C (PR #73326)
david-arm wrote:

> Do you think it's worth adding something to the Clang release note?

Done. Hope the documentation I added makes sense!

https://github.com/llvm/llvm-project/pull/73326
[clang] [llvm] [Clang] Emit TBAA info for enums in C (PR #73326)
david-arm wrote:

> I thought the suggestion was to add a few lines to
> https://github.com/llvm/llvm-project/blob/main/clang/docs/ReleaseNotes.rst

Yes you're right! For some reason I got mixed up with the LangRef, but I guess adding something to the LangRef does no harm either. I'll put something in the release notes too. :)

https://github.com/llvm/llvm-project/pull/73326
[llvm] [clang] [Clang] Emit TBAA info for enums in C (PR #73326)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/73326

>From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Fri, 24 Nov 2023 13:20:23 +
Subject: [PATCH 1/4] [Clang] Emit TBAA info for enums in C
[clang] [llvm] [Clang] Emit TBAA info for enums in C (PR #73326)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/73326

>From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Fri, 24 Nov 2023 13:20:23 +
Subject: [PATCH 1/5] [Clang] Emit TBAA info for enums in C
[llvm] [clang] [Clang] Emit TBAA info for enums in C (PR #73326)
@@ -196,6 +196,9 @@ C Language Changes
   number of elements in the flexible array member. This information can improve
   the results of the array bound sanitizer and the
   ``__builtin_dynamic_object_size`` builtin.
+- Enums will now be represented in TBAA metadata using their actual underlying

david-arm wrote:

The comment is actually under the heading `C Language Changes` so I think that should be clear enough. Is that ok?

https://github.com/llvm/llvm-project/pull/73326
[clang] [llvm] [Clang] Emit TBAA info for enums in C (PR #73326)
https://github.com/david-arm closed https://github.com/llvm/llvm-project/pull/73326
[llvm] [clang] [clang-tools-extra] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)
@@ -346,7 +346,9 @@ void RuntimePointerChecking::tryToCreateDiffCheck(
   auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
   const Loop *StartARLoop = SrcStartAR->getLoop();
   if (StartARLoop == SinkStartAR->getLoop() &&
-      StartARLoop == InnerLoop->getParentLoop()) {
+      StartARLoop == InnerLoop->getParentLoop() &&
+      SrcStartAR->getStepRecurrence(*SE) !=

david-arm wrote:

Hi @fhahn, sorry I only just saw this now! I find with github it's really easy to miss review comments compared to Phabricator...

https://github.com/llvm/llvm-project/pull/73515
[clang-tools-extra] [clang] [llvm] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/73515

>From 30251642f8c208c63f3f3097c337ef0d5bc633b5 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Mon, 27 Nov 2023 13:43:26 +
Subject: [PATCH 1/4] [LoopVectorize] Improve algorithm for hoisting runtime
 checks

When attempting to hoist runtime checks out of a loop we currently
avoid creating pointer diff checks and prefer to do expanded range
checks instead. This gives us the opportunity to hoist runtime checks
out of a loop, since these checks are loop invariant. However, in some
cases the pointer diff checks would also be loop invariant and so will
naturally get hoisted. Therefore, since diff checks are cheaper, we
should prefer to use those instead.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |   5 +-
 .../LoopVectorize/runtime-checks-hoist.ll     | 143 ++
 2 files changed, 121 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 3d1edd5f038a25..05765223397987 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -346,7 +346,10 @@ void RuntimePointerChecking::tryToCreateDiffCheck(
   auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
   const Loop *StartARLoop = SrcStartAR->getLoop();
   if (StartARLoop == SinkStartAR->getLoop() &&
-      StartARLoop == InnerLoop->getParentLoop()) {
+      StartARLoop == InnerLoop->getParentLoop() &&
+      !SE->isKnownPredicate(ICmpInst::ICMP_EQ,
+                            SrcStartAR->getStepRecurrence(*SE),
+                            SinkStartAR->getStepRecurrence(*SE))) {
     LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
                          "cannot be hoisted out of the outer loop\n");
     CanUseDiffCheck = false;
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
index 891597cbdc48a8..81702bf34e96be 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
@@ -69,11 +69,11 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou
 ; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = add nsw i64 [[TMP13]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; CHECK-NEXT:    store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -189,12 +189,12 @@ define void @full_checks(ptr nocapture noundef %dst, ptr nocapture noundef reado
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope !9
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META9:![0-9]+]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !12, !noalias !9
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META12:![0-9]+]], !noalias [[META9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
-; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, !alias.scope !12, !noalias !9
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, !alias.scope [[META12]], !noalias [[META9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -319,13 +319,13 @@ define void @full_ch
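The kind of loop nest this change benefits looks roughly like the following (an illustrative C sketch, not taken from the patch's tests): because both inner-loop start addresses advance by the same step per outer iteration, the cheap pointer-difference check between them is outer-loop invariant and can be hoisted:

```
void f(int *dst, int *src, int stride, int n, int m) {
  for (int i = 0; i < n; i++)      /* outer loop */
    for (int j = 0; j < m; j++)    /* vectorised inner loop, small trip count */
      dst[i * stride + j] = src[i * stride + j] + 1;
}
/* dst_start - src_start is the same on every outer iteration, since both
   advance by stride * sizeof(int) bytes, so the diff check is emitted once
   in the outer preheader instead of per outer iteration. */
```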
[clang-tools-extra] [clang] [llvm] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/73515

>From 30251642f8c208c63f3f3097c337ef0d5bc633b5 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Mon, 27 Nov 2023 13:43:26 +
Subject: [PATCH 1/5] [LoopVectorize] Improve algorithm for hoisting runtime
 checks
[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
https://github.com/david-arm commented:

Wow, this is a huge patch. :) It took me a few hours to work through all the tests, and it's quite possible I've missed something. However, overall it looks fine and I can't see any major issues. I think there is one missing test, but once that's fixed I'll approve it!

https://github.com/llvm/llvm-project/pull/70474
[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
https://github.com/david-arm edited https://github.com/llvm/llvm-project/pull/70474
[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
@@ -0,0 +1,2503 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svld2q_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z14test_svld2q_u8u10__SVBool_tPKh(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svuint8x2_t test_svld2q_u8(svbool_t pg, const uint8_t *base)
+{
+  return SVE_ACLE_FUNC(svld2q,,_u8,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z14test_svld2q_s8u10__SVBool_tPKa(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svint8x2_t test_svld2q_s8(svbool_t pg, const int8_t *base)
+{
+  return SVE_ACLE_FUNC(svld2q,,_s8,)(pg, base);
+}
+// CHECK-LABEL: @test_svld2q_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_u16u10__SVBool_tPKt(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
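For orientation while reading the quoted checks, the intrinsic under test can be exercised as below (a minimal ACLE sketch assuming a toolchain with SVE2.1 support, e.g. built with -march=armv9-a+sve2p1; it mirrors the test's non-overloaded form):

```
#include <arm_sve.h>

svuint8x2_t load_two_quadword_vectors(svbool_t pg, const uint8_t *base) {
  // LD2Q: structured contiguous load of two vectors with quadword granules.
  return svld2q_u8(pg, base);
}
```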
[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
https://github.com/david-arm approved this pull request.

LGTM. I think I would have preferred the patch to be split up into 3 - one for contiguous extending loads/truncating stores, one for structured loads/stores, and one for the gathers. That's why it took me so long to review this patch as I was constantly trying to keep all the information about each builtin/instruction in my head whilst reviewing the tests for correctness!

https://github.com/llvm/llvm-project/pull/70474
[clang] [AArch64][Clang] Refactor code to emit SVE & SME builtins (PR #70959)
https://github.com/david-arm approved this pull request.

LGTM!

https://github.com/llvm/llvm-project/pull/70959
[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
https://github.com/david-arm commented:

Thanks for this! I've not done an exhaustive review, but I'll leave the comments I have so far.

https://github.com/llvm/llvm-project/pull/70474
[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
https://github.com/david-arm edited https://github.com/llvm/llvm-project/pull/70474
[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
@@ -9702,17 +9727,34 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
   auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
 
-  Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+  auto PredTy = MemoryTy;
+  auto AddrMemoryTy = MemoryTy;
+  bool IsTruncatingStore = true;

david-arm wrote:

Same comment as in EmitSVEMaskedLoad. Perhaps better just to have an IsQuadStore boolean, since it's an exceptional case and unlikely to have commonality with other instructions?

https://github.com/llvm/llvm-project/pull/70474
[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
@@ -2614,6 +2619,37 @@ def int_aarch64_sve_ld1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
 def int_aarch64_sve_ldnt1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic;
 def int_aarch64_sve_ldnt1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
 
+//
+// SVE2.1 - Contiguous loads to quadword (single vector)
+//
+
+class SVE2p1_Single_Load_Quadword
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [llvm_nxv1i1_ty, llvm_ptr_ty],
+                            [IntrReadMem]>;

david-arm wrote:

I think this should also have IntrArgMemOnly too, similar to AdvSIMD_1Vec_Load_Intrinsic.

https://github.com/llvm/llvm-project/pull/70474
[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
@@ -9671,28 +9677,47 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
   // The vector type that is returned may be different from the
   // eventual type loaded from memory.
   auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
-  auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+  llvm::ScalableVectorType *MemoryTy = nullptr;
+  llvm::ScalableVectorType *PredTy = nullptr;
+  bool IsExtendingLoad = true;

david-arm wrote:

I personally think using this variable is misleading because aarch64_sve_ld1uwq is actually an extending load - we're extending from 32-bit memory elements to 128-bit integer elements. So it looks odd when we set this to false. Perhaps it's better to just explicitly have a variable called `IsQuadLoad` and use that instead rather than try to generalise this. The quad-word loads are really just an exception here because we're working around the lack of a type. So you'd have something like:

  case Intrinsic::aarch64_sve_ld1uwq:
    IsQuadLoad = true;
    ...
  default:
    IsQuadLoad = false;

  Function *F = CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);
  ...
  if (IsQuadLoad)
    return Load;

https://github.com/llvm/llvm-project/pull/70474
[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
@@ -9702,17 +9727,34 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
   auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
 
-  Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+  auto PredTy = MemoryTy;
+  auto AddrMemoryTy = MemoryTy;
+  bool IsTruncatingStore = true;
+  ;

david-arm wrote:

Extra ; here

https://github.com/llvm/llvm-project/pull/70474
[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
@@ -9671,28 +9677,47 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
   // The vector type that is returned may be different from the
   // eventual type loaded from memory.
   auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
-  auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+  llvm::ScalableVectorType *MemoryTy = nullptr;
+  llvm::ScalableVectorType *PredTy = nullptr;
+  bool IsExtendingLoad = true;
+  switch (IntrinsicID) {
+  case Intrinsic::aarch64_sve_ld1uwq:
+  case Intrinsic::aarch64_sve_ld1udq:
+    MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+    PredTy =
+        llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);

david-arm wrote:

You can just do

  llvm::ScalableVectorType::get(Type::getInt1Ty(getLLVMContext()), 1);

https://github.com/llvm/llvm-project/pull/70474
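The two spellings are interchangeable; a small self-contained sketch (assuming LLVM's IR headers) showing that both construct the same uniqued `<vscale x 1 x i1>` type:

```
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  // Spelling used in the patch:
  auto *A = ScalableVectorType::get(IntegerType::get(Ctx, 1), 1);
  // Shorter spelling suggested in the review:
  auto *B = ScalableVectorType::get(Type::getInt1Ty(Ctx), 1);
  return A == B ? 0 : 1; // LLVM types are uniqued, so this returns 0
}
```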
[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)
@@ -2614,6 +2619,37 @@ def int_aarch64_sve_ld1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
 def int_aarch64_sve_ldnt1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic;
 def int_aarch64_sve_ldnt1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
 
+//
+// SVE2.1 - Contiguous loads to quadword (single vector)
+//
+
+class SVE2p1_Single_Load_Quadword
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [llvm_nxv1i1_ty, llvm_ptr_ty],
+                            [IntrReadMem]>;
+def int_aarch64_sve_ld1uwq : SVE2p1_Single_Load_Quadword;
+def int_aarch64_sve_ld1udq : SVE2p1_Single_Load_Quadword;
+
+//
+// SVE2.1 - Contiguous store from quadword (single vector)
+//
+
+class SVE2p1_Single_Store_Quadword
+    : DefaultAttrsIntrinsic<[],
+                            [llvm_anyvector_ty, llvm_nxv1i1_ty, llvm_ptr_ty],
+                            [IntrArgMemOnly]>;

david-arm wrote:

This also needs the IntrWriteMem flag otherwise we could end up incorrectly rescheduling stores in the wrong place.

https://github.com/llvm/llvm-project/pull/70474
[clang] [AArch64] Cast predicate operand of SVE gather loads/scater stores to the parameter type of the intrinsic (NFC) (PR #71289)
https://github.com/david-arm approved this pull request.

LGTM!

https://github.com/llvm/llvm-project/pull/71289
[clang-tools-extra] [clang] [llvm] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)
https://github.com/david-arm closed https://github.com/llvm/llvm-project/pull/73515
[clang-tools-extra] [llvm] [LoopVectorize] Enable hoisting of runtime checks by default (PR #71538)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/71538

>From 8a2af20a52fd851eaff1cfa7d50df8b994d0db0d Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Tue, 7 Nov 2023 13:57:17 +
Subject: [PATCH 1/2] [LoopVectorize] Enable hoisting of runtime checks by
 default

With commit https://reviews.llvm.org/D152366 I introduced functionality
that permitted the hoisting of runtime memory checks from a vectorised
inner loop to the preheader of the next outer-most loop. This is useful
for benchmarks like SPEC2017's x264 where the inner loop is vectorised
and only has a small trip count. In such cases the runtime memory
checks become expensive and since the checks never fail in the case of
x264 it makes sense to do this. However, this behaviour was controlled
by the flag -hoist-runtime-checks which was off by default.

This patch enables this flag by default for all targets, since I
believe this is a generally beneficial thing to do. I have tested this
with SPEC2017 and I see 2.3% and 2.6% improvements with x264 on
neoverse-v1 and neoverse-n1, respectively. Similarly, I saw slight
improvements in the overall geomean on both machines. The only other
notable changes were a 1% drop in the roms benchmark, which was
compensated for by a 1% improvement in fotonik3d.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  2 +-
 .../invariant-store-vectorization.ll          | 86 +-
 .../multiple-strides-vectorization.ll         | 90 ---
 .../runtime-checks-difference.ll              |  2 +-
 .../LoopVectorize/runtime-checks-hoist.ll     |  2 +-
 5 files changed, 124 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 3d1edd5f038a25..05ca09968207fe 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -142,7 +142,7 @@ static cl::opt<bool, true> HoistRuntimeChecks(
     "hoist-runtime-checks", cl::Hidden,
     cl::desc(
         "Hoist inner loop runtime memory checks to outer loop if possible"),
-    cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(false));
+    cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true));
 bool VectorizerParams::HoistRuntimeChecks;
 
 bool VectorizerParams::isInterleaveForced() {
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 9e36649bcf73d6..52101fda6309f6 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -13,9 +13,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; address.
 
-; memory check is found.conflict = b[max(n-1,1)] > a && (ptr a)+1 > (ptr b)
-
-
 define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction(
 ; CHECK-NEXT:  entry:
@@ -346,74 +343,75 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
 ; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP20]], label [[FOR_END10:%.*]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]]
 ; CHECK:       for.cond1.preheader.preheader:
-; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[VAR2:%.*]], i64 4
-; CHECK-NEXT:    [[INVARIANT_GEP5:%.*]] = getelementptr i8, ptr [[VAR1:%.*]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[ITR]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[VAR1:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 4
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[VAR2:%.*]], i64 4
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
 ; CHECK:       for.cond1.preheader:
 ; CHECK-NEXT:    [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR_INC8:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
 ; CHECK-NEXT:    [[J_022:%.*]] = phi i32 [ [[J_1_LCSSA:%.*]], [[FOR_INC8]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV23]], 2
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[VAR1]], i64 [[TMP0]]
-; CHECK-NEXT:    [[GEP6:%.*]] = getelementptr i8, ptr [[INVARIANT_GEP5]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]]
 ; CHECK-NEXT:    br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]]
 ; CHECK:       for.body3.lr.ph:
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[VAR1]], i64 [[INDVARS_IV23]]
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[J_022]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[J_022]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX5_PROMOTED:%.*]] = load i32, pt
[clang-tools-extra] [llvm] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,726 @@ + +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-lit" david-arm wrote: Sorry, I just realised I have lost this change somehow. I'll fix it. https://github.com/llvm/llvm-project/pull/72273 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,726 @@ + +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-lit" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl &ExitBlocks); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Value *Start, +Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, +Value *MaxLen, Value *Index, Value *Start, +bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Recognize AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager &LPM) { + + if (skipLoop(L)) +return false; + + auto *DT = &getAnalysis().getDomTree(); + auto *LI = &getAnalysis().getLoopInfo(); + auto &TTI = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char AArch64LoopIdiomTransformLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN( +AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", +"Transform specific loop idioms into optimised vector forms", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) 
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END( +AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", +"Transform specific loop idioms into optimised vector forms", false, false) + +Pass *llvm::createAArch64LoopIdiomTransformPass() { + return new AArch64LoopIdiomTransformLegacyPass(); +} + +PreservedAnalyses +AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + if (DisableAll) +return PreservedAnalyses::all(); + + const auto *DL = &L.getHeader()->getModule()->getDataLayout(); + + AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL); + if (!LIT.run(&L)) +return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +//===-
[llvm] [clang-tools-extra] [LoopVectorize] Enable hoisting of runtime checks by default (PR #71538)
david-arm wrote: Gentle ping! https://github.com/llvm/llvm-project/pull/73515 has now landed so I think this patch should be ready to go. https://github.com/llvm/llvm-project/pull/71538
[clang] [Clang][SME2] Add multi-vector zip & unzip builtins (PR #74841)
david-arm wrote: For builtins that operate purely on SVE vectors I think we've used the convention of adding _vector_ to the test name, e.g. acle_sme2_vector_rshl.c. Should we do the same here? https://github.com/llvm/llvm-project/pull/74841
[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,839 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. +// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl &ExitBlocks); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Value *Start, +Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, +Value *MaxLen, Value *Index, Value *Start, +bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { 
+initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager &LPM) { + + if (skipLoop(L)) +return false; + + auto *DT = &getAnalysis().getDomTree(); + auto *LI = &getAnalysis().getLoopInfo(); + auto &TTI = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymou
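For reference, the idiom described in the pass's header comment above corresponds to source loops of roughly the following shape. This is a hedged reconstruction from the comment itself; the function name, parameter types, and return convention are assumptions for illustration:

```c
/* Sketch of the byte-compare idiom the pass recognizes: scan two byte
   arrays and return the index of the first mismatch, or n if none is
   found. Note the pre-increment: position i itself is not compared,
   matching the while (++i != n) form in the pass comment. */
unsigned first_mismatch(const unsigned char *a, const unsigned char *b,
                        unsigned i, unsigned n) {
  while (++i != n) {
    if (a[i] != b[i])
      break;
  }
  return i;
}
```

Despite the early exit, which the loop vectorizer cannot handle, the pass can emit a vectorized scan provided the expanded code guards against faulting loads when a vector access would cross a page boundary; that expansion appears to be what `expandFindMismatch` materializes.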
[llvm] [clang] [Clang][SME2] Add builtins for moving multi-vectors to/from ZA (PR #71191)
https://github.com/david-arm approved this pull request. LGTM! I had one minor comment, but I won't hold up the patch for it. https://github.com/llvm/llvm-project/pull/71191
[clang] [llvm] [Clang][SME2] Add builtins for moving multi-vectors to/from ZA (PR #71191)
https://github.com/david-arm edited https://github.com/llvm/llvm-project/pull/71191
[clang] [llvm] [Clang][SME2] Add builtins for moving multi-vectors to/from ZA (PR #71191)
@@ -299,6 +299,44 @@ multiclass ZAAddSub { defm SVADD : ZAAddSub<"add">; defm SVSUB : ZAAddSub<"sub">; +// SME2 - MOVA + +// +// Single, 2 and 4 vector-group read/write intrinsics. +// + +multiclass ZAWrite_VG<string n, string t, string i, list<ImmCheck> checks> { + def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "_vg2", "vim2", t, MergeNone, i # "_hor_vg2", [IsSharedZA, IsStreaming], checks>; + def NAME # _VG2_V : Inst<"svwrite_ver_" # n # "_vg2", "vim2", t, MergeNone, i # "_ver_vg2", [IsSharedZA, IsStreaming], checks>; + def NAME # _VG4_H : Inst<"svwrite_hor_" # n # "_vg4", "vim4", t, MergeNone, i # "_hor_vg4", [IsSharedZA, IsStreaming], checks>; + def NAME # _VG4_V : Inst<"svwrite_ver_" # n # "_vg4", "vim4", t, MergeNone, i # "_ver_vg4", [IsSharedZA, IsStreaming], checks>; + def NAME # _VG1x2 : Inst<"svwrite_" # n # "_vg1x2", "vm2", t, MergeNone, i # "_vg1x2", [IsSharedZA, IsStreaming], []>; + def NAME # _VG1x4 : Inst<"svwrite_" # n # "_vg1x4", "vm4", t, MergeNone, i # "_vg1x4", [IsSharedZA, IsStreaming], []>; +} + +let TargetGuard = "sme2" in { + defm SVWRITE_ZA8 : ZAWrite_VG<"za8[_{d}]", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_0>]>; david-arm wrote: This is just a thought - is it worth pushing the `"[_{d}]"` bit into the multiclass given it's the same for each size, i.e. ``` def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "[_{d}]_vg2", "vim2", t, MergeNone, i # "_hor_vg2", [IsSharedZA, IsStreaming], checks>; ``` and same question for the reads. https://github.com/llvm/llvm-project/pull/71191
[llvm] [clang-tools-extra] [LoopVectorize] Enable hoisting of runtime checks by default (PR #71538)
https://github.com/david-arm closed https://github.com/llvm/llvm-project/pull/71538
[clang] [Clang] Emit TBAA info for enums in C (PR #73326)
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/73326 When emitting TBAA information for enums in C code we currently just treat the data as an 'omnipotent char'. However, with C strict aliasing this means we fail to optimise certain cases. For example, in the SPEC2017 xz benchmark there are structs that contain arrays of enums, and clang pessmistically assumes that accesses to those enums could alias with other struct members that have a different type. According to https://en.cppreference.com/w/c/language/enum enums should be treated as 'int' types unless explicitly specified (C23) or if 'int' would not be large enough to hold all the enumerated values. In the latter case the compiler is free to choose a suitable integer that would hold all such values. When compiling C code this patch generates TBAA information for the enum by using an equivalent integer of the size clang has already chosen for the enum. I have ignored C++ for now because the rules are more complex. New test added here: clang/test/CodeGen/tbaa.c >From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 24 Nov 2023 13:20:23 + Subject: [PATCH] [Clang] Emit TBAA info for enums in C When emitting TBAA information for enums in C code we currently just treat the data as an 'omnipotent char'. However, with C strict aliasing this means we fail to optimise certain cases. For example, in the SPEC2017 xz benchmark there are structs that contain arrays of enums, and clang pessmistically assumes that accesses to those enums could alias with other struct members that have a different type. According to https://en.cppreference.com/w/c/language/enum enums should be treated as 'int' types unless explicitly specified (C23) or if 'int' would not be large enough to hold all the enumerated values. In the latter case the compiler is free to choose a suitable integer that would hold all such values. When compiling C code this patch generates TBAA information for the enum by using an equivalent integer of the size clang has already chosen for the enum. I have ignored C++ for now because the rules are more complex. New test added here: clang/test/CodeGen/tbaa.c --- clang/lib/CodeGen/CodeGenTBAA.cpp | 5 +- clang/test/CodeGen/tbaa.c | 116 ++ 2 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/tbaa.c diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp index 8705d3d65f1a573..f59d3d422d5209d 100644 --- a/clang/lib/CodeGen/CodeGenTBAA.cpp +++ b/clang/lib/CodeGen/CodeGenTBAA.cpp @@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) { // Enum types are distinct types. In C++ they have "underlying types", // however they aren't related for TBAA. if (const EnumType *ETy = dyn_cast(Ty)) { +if (!Features.CPlusPlus) + return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0)); + // In C++ mode, types have linkage, so we can rely on the ODR and // on their mangled names, if they're external. // TODO: Is there a way to get a program-wide unique name for a // decl with local linkage or no linkage? 
-if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible()) +if (!ETy->getDecl()->isExternallyVisible()) return getChar(); SmallString<256> OutName; diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c new file mode 100644 index 000..0ab81f60a71941c --- /dev/null +++ b/clang/test/CodeGen/tbaa.c @@ -0,0 +1,116 @@ +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=PATH +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA +// Test TBAA metadata generated by front-end. +// +// NO-TBAA-NOT: !tbaa + +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +typedef enum { + RED_AUTO_32, + GREEN_AUTO_32, + BLUE_AUTO_32 +} EnumAuto32; + +typedef enum { + RED_AUTO_64, + GREEN_AUTO_64, + BLUE_AUTO_64 = 0x1ull +} EnumAuto64; + +typedef enum : uint16_t { + RED_16, + GREEN_16, + BLUE_16 +} Enum16; + +typedef enum : uint8_t { + RED_8, + GREEN_8, + BLUE_8 +} Enum8; + +uint32_t g0(EnumAuto32 *E, uint32_t *val) { +// CHECK-LABEL: define{{.*}} i32 @g0( +// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]] +// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]] +// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]
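The strict-aliasing benefit described in the commit message can be illustrated with a small sketch (hypothetical code in the spirit of the xz example; the struct and names are not from the benchmark). Once the enum's TBAA tag is the integer type of the width clang chose, a store through the enum member can no longer be assumed to alias a differently typed member:

```c
/* Illustrative only. With this patch (C mode), the store to s->modes[i] is
   tagged as 'int' rather than 'omnipotent char', so under C strict aliasing
   it cannot modify s->total, and the second load of s->total can reuse the
   value already loaded into t. Previously the reload was mandatory. */
typedef enum { MODE_FAST, MODE_NORMAL, MODE_BEST } Mode;

struct State {
  unsigned long long total;
  Mode modes[4];
};

unsigned long long update(struct State *s, int i) {
  unsigned long long t = s->total;
  s->modes[i] = MODE_NORMAL;
  return t + s->total;
}
```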
[clang] [Clang] Emit TBAA info for enums in C (PR #73326)
https://github.com/david-arm edited https://github.com/llvm/llvm-project/pull/73326
[clang] [Clang] Emit TBAA info for enums in C (PR #73326)
@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) { // Enum types are distinct types. In C++ they have "underlying types", // however they aren't related for TBAA. if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) { +if (!Features.CPlusPlus) + return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0)); david-arm wrote: I am not sure if this is entirely correct, so I would appreciate some guidance here! https://github.com/llvm/llvm-project/pull/73326
[clang] [Clang] Emit TBAA info for enums in C (PR #73326)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/73326 >From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 24 Nov 2023 13:20:23 + Subject: [PATCH 1/2] [Clang] Emit TBAA info for enums in C When emitting TBAA information for enums in C code we currently just treat the data as an 'omnipotent char'. However, with C strict aliasing this means we fail to optimise certain cases. For example, in the SPEC2017 xz benchmark there are structs that contain arrays of enums, and clang pessmistically assumes that accesses to those enums could alias with other struct members that have a different type. According to https://en.cppreference.com/w/c/language/enum enums should be treated as 'int' types unless explicitly specified (C23) or if 'int' would not be large enough to hold all the enumerated values. In the latter case the compiler is free to choose a suitable integer that would hold all such values. When compiling C code this patch generates TBAA information for the enum by using an equivalent integer of the size clang has already chosen for the enum. I have ignored C++ for now because the rules are more complex. New test added here: clang/test/CodeGen/tbaa.c --- clang/lib/CodeGen/CodeGenTBAA.cpp | 5 +- clang/test/CodeGen/tbaa.c | 116 ++ 2 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/tbaa.c diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp index 8705d3d65f1a573..f59d3d422d5209d 100644 --- a/clang/lib/CodeGen/CodeGenTBAA.cpp +++ b/clang/lib/CodeGen/CodeGenTBAA.cpp @@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) { // Enum types are distinct types. In C++ they have "underlying types", // however they aren't related for TBAA. if (const EnumType *ETy = dyn_cast(Ty)) { +if (!Features.CPlusPlus) + return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0)); + // In C++ mode, types have linkage, so we can rely on the ODR and // on their mangled names, if they're external. // TODO: Is there a way to get a program-wide unique name for a // decl with local linkage or no linkage? -if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible()) +if (!ETy->getDecl()->isExternallyVisible()) return getChar(); SmallString<256> OutName; diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c new file mode 100644 index 000..0ab81f60a71941c --- /dev/null +++ b/clang/test/CodeGen/tbaa.c @@ -0,0 +1,116 @@ +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=PATH +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA +// Test TBAA metadata generated by front-end. 
+// +// NO-TBAA-NOT: !tbaa + +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +typedef enum { + RED_AUTO_32, + GREEN_AUTO_32, + BLUE_AUTO_32 +} EnumAuto32; + +typedef enum { + RED_AUTO_64, + GREEN_AUTO_64, + BLUE_AUTO_64 = 0x1ull +} EnumAuto64; + +typedef enum : uint16_t { + RED_16, + GREEN_16, + BLUE_16 +} Enum16; + +typedef enum : uint8_t { + RED_8, + GREEN_8, + BLUE_8 +} Enum8; + +uint32_t g0(EnumAuto32 *E, uint32_t *val) { +// CHECK-LABEL: define{{.*}} i32 @g0( +// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]] +// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]] +// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]] +// PATH-LABEL: define{{.*}} i32 @g0( +// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]] +// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]] +// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]] + *val = 5; + *E = RED_AUTO_32; + return *val; +} + +uint64_t g1(EnumAuto64 *E, uint64_t *val) { +// CHECK-LABEL: define{{.*}} i64 @g1( +// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]] +// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]] +// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]] +// PATH-LABEL: define{{.*}} i64 @g1( +// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]] +// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]] +// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]] + *val = 5; + *E = RED_AUTO_64; + return *val; +} + +uint16_t g2(Enum16 *E, uint16_t *val) { +// CHECK-LABEL: define{{.*}} i16 @g2( +// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TA
[clang] [Clang] Emit TBAA info for enums in C (PR #73326)
@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) { // Enum types are distinct types. In C++ they have "underlying types", // however they aren't related for TBAA. if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) { +if (!Features.CPlusPlus) + return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0)); david-arm wrote: Good suggestion - thanks! https://github.com/llvm/llvm-project/pull/73326
[clang] [AArch64][SME2] Add multi-vector SEL (x2, x4) ACLE builtins & intrinsics (PR #73188)
@@ -0,0 +1,384 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// 8-bit ZIPs david-arm wrote: I think this comment should say "8-bit SELs" and similarly for all the other comments in both the selx2 and selx4 files. https://github.com/llvm/llvm-project/pull/73188 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64][SME2] Add multi-vector SEL (x2, x4) ACLE builtins & intrinsics (PR #73188)
david-arm wrote: Should the file be renamed to acle_sme2_vector_selx4? This would make it consistent with the existing acle_sme2_vector_add.c file, which also has SVE-like instructions that only operate on SVE vectors. https://github.com/llvm/llvm-project/pull/73188
[clang] [AArch64][SME2] Add multi-vector SEL (x2, x4) ACLE builtins & intrinsics (PR #73188)
david-arm wrote: Should the file be renamed to acle_sme2_vector_selx2? This would make it consistent with the existing acle_sme2_vector_add.c file, which also has SVE-like instructions that only operate on SVE vectors. https://github.com/llvm/llvm-project/pull/73188
[clang] [AArch64][SME2] Add multi-vector SEL (x2, x4) ACLE builtins & intrinsics (PR #73188)
https://github.com/david-arm approved this pull request. LGTM! Thanks for the changes. :) https://github.com/llvm/llvm-project/pull/73188
[clang-tools-extra] [clang] [llvm] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/73515 >From 30251642f8c208c63f3f3097c337ef0d5bc633b5 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Mon, 27 Nov 2023 13:43:26 + Subject: [PATCH 1/3] [LoopVectorize] Improve algorithm for hoisting runtime checks When attempting to hoist runtime checks out of a loop we currently avoid creating pointer diff checks and prefer to do expanded range checks instead. This gives us the opportunity to hoist runtime checks out of a loop, since these checks are loop invariant. However, in some cases the pointer diff checks would also be loop invariant and so will naturally get hoisted. Therefore, since diff checks are cheaper so we should prefer to use those instead. --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 5 +- .../LoopVectorize/runtime-checks-hoist.ll | 143 ++ 2 files changed, 121 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 3d1edd5f038a25e..057652233979876 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -346,7 +346,10 @@ void RuntimePointerChecking::tryToCreateDiffCheck( auto *SinkStartAR = cast(SinkStartInt); const Loop *StartARLoop = SrcStartAR->getLoop(); if (StartARLoop == SinkStartAR->getLoop() && -StartARLoop == InnerLoop->getParentLoop()) { +StartARLoop == InnerLoop->getParentLoop() && +!SE->isKnownPredicate(ICmpInst::ICMP_EQ, + SrcStartAR->getStepRecurrence(*SE), + SinkStartAR->getStepRecurrence(*SE))) { LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these " "cannot be hoisted out of the outer loop\n"); CanUseDiffCheck = false; diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll index 891597cbdc48a8f..81702bf34e96bed 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll @@ -69,11 +69,11 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou ; CHECK-NEXT:[[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], [[TMP10]] ; CHECK-NEXT:[[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP14]] ; CHECK-NEXT:[[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope !0 +; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT:[[TMP17:%.*]] = add nsw i64 [[TMP13]], [[TMP11]] ; CHECK-NEXT:[[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP17]] ; CHECK-NEXT:[[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0 -; CHECK-NEXT:store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT:store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT:[[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT:br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -189,12 +189,12 @@ define void @full_checks(ptr nocapture noundef %dst, ptr nocapture noundef reado ; CHECK-NEXT:[[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], [[TMP3]] ; CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP5]] ; CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; 
CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope !9 +; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META9:![0-9]+]] ; CHECK-NEXT:[[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP5]] ; CHECK-NEXT:[[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 -; CHECK-NEXT:[[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !12, !noalias !9 +; CHECK-NEXT:[[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META12:![0-9]+]], !noalias [[META9]] ; CHECK-NEXT:[[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]] -; CHECK-NEXT:store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, !alias.scope !12, !noalias !9 +; CHECK-NEXT:store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, !alias.scope [[META12]], !noalias [[META9]] ; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT:[[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT:br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -319,13 +319,13 @@ define void @ful
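A sketch of the case this heuristic change targets (reconstructed from the commit message; the loop below is an assumed shape, not copied from the test file): when the source and sink pointers advance with the same step in the outer loop, the distance between them is outer-loop invariant, so the cheap pointer-diff check is itself hoistable and there is no reason to fall back to the more expensive expanded range checks:

```c
/* Both accesses stride by n ints per outer iteration, so
   (dst + i*n) - (src + i*n) == dst - src is invariant in the i-loop.
   A single hoisted diff check then covers every inner-loop execution;
   the patch only refuses diff checks when the two step recurrences
   are not known to be equal. */
void add_rows(int *dst, const int *src, int m, int n) {
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      dst[i * n + j] = src[i * n + j] + 1;
}
```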
[clang] [llvm] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -1859,6 +1867,34 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode); } +template david-arm wrote: Rather than create two almost identical copies of the function with a template parameter, I think in this case it makes sense to just pass Max in as a function argument. https://github.com/llvm/llvm-project/pull/73317
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -1859,6 +1867,34 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode); } +template +void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, +unsigned NumOutVecs, +unsigned Opc) { + if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4))) +if (Imm->getZExtValue() > Max) + return; + + SDValue ZtValue; + ImmToTile(Node->getOperand(2), ZtValue); david-arm wrote: If someone invokes the intrinsic with Op2 != 0 this will likely crash. Is it worth asserting the result of ImmToTile is true so that at least it's more obvious? https://github.com/llvm/llvm-project/pull/73317
[clang] [llvm] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -0,0 +1,280 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT:[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT:ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CPP-CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT:[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT:ret [[TMP8]] +// +svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za { david-arm wrote: For all of the functions in both test files shouldn't we also be testing the overloaded forms of the builtins? I'd expected to see 5 RUN lines in total for each file https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)
@@ -0,0 +1,51 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS david-arm wrote: Can delete these lines. https://github.com/llvm/llvm-project/pull/72849 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)
@@ -0,0 +1,51 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s david-arm wrote: I think in this case we can kill off the RUN lines for the overloaded forms because in the ACLE they are never overloaded. https://github.com/llvm/llvm-project/pull/72849
[clang] [llvm] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)
@@ -2748,6 +2748,22 @@ AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const { return BB; } +MachineBasicBlock *AArch64TargetLowering::EmitZTSpillFill(MachineInstr &MI, + MachineBasicBlock *BB, + bool IsSpill) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB; + if (IsSpill) { david-arm wrote: I think this can be simplified to ``` unsigned Opc = IsSpill ? AArch64::STR_TX : AArch64::LDR_TX; MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); MIB.addReg(MI.getOperand(0).getReg()); MIB.add(MI.getOperand(1)); // Base ``` https://github.com/llvm/llvm-project/pull/72849
[clang] [llvm] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)
david-arm wrote: It looks like a few other pull requests are changing the same code around ImmToTile. Might be good to land this smaller patch first so you can rebase the others and reduce the diffs! https://github.com/llvm/llvm-project/pull/72849
[llvm] [clang] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)
https://github.com/david-arm approved this pull request. LGTM! Perfect! https://github.com/llvm/llvm-project/pull/72849
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -0,0 +1,280 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT:[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT:ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CPP-CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT:[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT:ret [[TMP8]] +// +svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za { david-arm wrote: OK that's fair enough! https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
https://github.com/david-arm commented: This looks good to me, but I think it needs rebasing after https://github.com/llvm/llvm-project/pull/72849 landed. It also looks like @sdesmalen-arm left a comment about renaming ImmToTile - perhaps that could be done in this patch? https://github.com/llvm/llvm-project/pull/73317
[clang] [AArch64][SME2] Remove IsPreservesZA from ldr_zt builtin (PR #74303)
@@ -319,7 +319,7 @@ let TargetGuard = "sme2" in {
 // Spill and fill of ZT0
 //
 let TargetGuard = "sme2" in {
-  def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;
+  def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_0>]>;
   def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;

david-arm wrote: Does the STR also need changing?

https://github.com/llvm/llvm-project/pull/74303
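If the answer is yes, the matching change would presumably look like the sketch below, the shown SVSTR_ZT definition with IsPreservesZA dropped. Whether svstr_zt really should lose the flag is exactly the open question in the review, so treat this as illustrative only:

  def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt",
                      [IsOverloadNone, IsStreamingCompatible, IsSharedZA],
                      [ImmCheck<0, ImmCheck0_0>]>;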
[clang] [Clang] Emit TBAA info for enums in C (PR #73326)
david-arm wrote: Gentle ping! https://github.com/llvm/llvm-project/pull/73326
[clang] [Clang][SME2] Add builtins for multi-vector fp round to integral value (PR #75941)
https://github.com/david-arm approved this pull request. LGTM. Absolute perfection! https://github.com/llvm/llvm-project/pull/75941
[llvm] [clang-tools-extra] [clang] [Clang][SME2] Enable multi-vector loads & stores for SME2 (PR #75821)
https://github.com/david-arm approved this pull request. LGTM! A lovely patch. :) https://github.com/llvm/llvm-project/pull/75821
[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass recognizes certain loop idioms and transforms them into more
+// optimized versions of the same loop. In cases where this happens, it can
+// be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//    if (a[i] != b[i])
+//      break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+    DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+               cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+    "disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+    cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+             "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+    "aarch64-lit-verify", cl::Hidden, cl::init(false),
+    cl::desc("Verify loops generated by AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+                                     const TargetTransformInfo *TTI,
+                                     const DataLayout *DL)
+      : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+                      SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+                            GetElementPtrInst *GEPB, Instruction *Index,
+                            Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+                            PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+                            Value *Start, bool IncIdx, BasicBlock *FoundBB,
+                            BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+    initializeAArch64LoopIdiomTransformLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+                                                    LPPassManager &LPM) {
+
+  if (skipLoop(L))
+    return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+      *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+      DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout
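A hedged sketch of how one might exercise the pass in isolation. The pipeline name "aarch64-loop-idiom-transform" is an assumption inferred from the DEBUG_TYPE above, and the flags come straight from the cl::opt definitions in this excerpt:

  # Hypothetical opt invocations (pass name assumed from DEBUG_TYPE):
  opt -S -p "loop(aarch64-loop-idiom-transform)" input.ll -o out.ll
  # Keep the pass running but skip the byte-compare conversion, to diff output:
  opt -S -p "loop(aarch64-loop-idiom-transform)" -disable-aarch64-lit-bytecmp input.ll -o out.noconvert.ll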
[clang-tools-extra] [clang] [llvm] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/76034 >From a4caa47dc8d2db75f6bb2ac3f880da4e1f6bea82 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 19 Dec 2023 16:07:33 + Subject: [PATCH 1/2] Add tests showing runtime checks cost with low trip counts --- .../AArch64/low_trip_memcheck_cost.ll | 187 ++ 1 file changed, 187 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll new file mode 100644 index 00..397521c2d3dc8f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll @@ -0,0 +1,187 @@ +; REQUIRES: asserts +; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + + +define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc' +; CHECK: Calculating cost of runtime checks: +; CHECK: Total cost of runtime checks: 6 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond27.not = icmp eq i64 %outer.iv.next, %m + br i1 %exitcond27.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3' +; CHECK: Calculating cost of runtime checks: +; CHECK: Total cost of runtime checks: 6 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 3 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + 
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64' +; CHECK: Calculating cost of runtime checks: +; CHECK: Total cost of runtime checks: 6 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 64 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +defi
[clang] [llvm] [clang-tools-extra] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/76034 >From a4caa47dc8d2db75f6bb2ac3f880da4e1f6bea82 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 19 Dec 2023 16:07:33 + Subject: [PATCH 1/6] Add tests showing runtime checks cost with low trip counts --- .../AArch64/low_trip_memcheck_cost.ll | 187 ++ 1 file changed, 187 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll new file mode 100644 index 00..397521c2d3dc8f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll @@ -0,0 +1,187 @@ +; REQUIRES: asserts +; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + + +define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc' +; CHECK: Calculating cost of runtime checks: +; CHECK: Total cost of runtime checks: 6 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond27.not = icmp eq i64 %outer.iv.next, %m + br i1 %exitcond27.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3' +; CHECK: Calculating cost of runtime checks: +; CHECK: Total cost of runtime checks: 6 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 3 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + 
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64' +; CHECK: Calculating cost of runtime checks: +; CHECK: Total cost of runtime checks: 6 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 64 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +defi
[llvm] [clang] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
https://github.com/david-arm closed https://github.com/llvm/llvm-project/pull/72273
[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
david-arm wrote: Hi @dyung, sorry about this! It passed for me locally. It sounds like the new test needs a `REQUIRES: aarch64-registered-target` guard somewhere. I'll try to fix it asap. https://github.com/llvm/llvm-project/pull/72273
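For reference, the lit guard in question is a single directive at the top of the test; which file needs it is an assumption here, not confirmed by the thread:

  ; REQUIRES: aarch64-registered-target

Without it, bots built without the AArch64 backend will try to run the test and fail.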
[clang-tools-extra] [llvm] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
david-arm wrote: @dyung - fix pending here https://github.com/llvm/llvm-project/pull/77467 https://github.com/llvm/llvm-project/pull/72273
[llvm] [clang] [AArch64][SME] Fix multi vector cvt builtins (PR #77656)
@@ -34,118 +34,118 @@ define <vscale x 8 x bfloat> @multi_vector_cvt_x2_bf16(<vscale x 8 x bfloat> %unu
 ;
 ; FCVTZS
 ;
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vector_cvt_x2_f32_s32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
-; CHECK-LABEL: multi_vector_cvt_x2_f32_s32:
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vector_cvt_x2_s32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_s32_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    fcvtzs { z0.s, z1.s }, { z2.s, z3.s }
 ; CHECK-NEXT:    ret
-  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
 }

-define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vector_cvt_x4_f32_s32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
-; CHECK-LABEL: multi_vector_cvt_x4_f32_s32:
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vector_cvt_x4_s32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
+; CHECK-LABEL: multi_vector_cvt_x4_s32_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    fcvtzs { z0.s - z3.s }, { z4.s - z7.s }
 ; CHECK-NEXT:    ret
-  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
 }
 ;
 ; FCVTZU
 ;
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vector_cvt_x2_f32_u32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
-; CHECK-LABEL: multi_vector_cvt_x2_f32_u32:
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vector_cvt_x2_u32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_u32_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    fcvtzu { z0.s, z1.s }, { z2.s, z3.s }
 ; CHECK-NEXT:    ret
-  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
 }

-define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vector_cvt_x4_f32_u32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
-; CHECK-LABEL: multi_vector_cvt_x4_f32_u32:
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vector_cvt_x4_u32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
+; CHECK-LABEL: multi_vector_cvt_x4_u32_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    fcvtzu { z0.s - z3.s }, { z4.s - z7.s }
 ; CHECK-NEXT:    ret
-  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
 }
 ;
 ; SCVTF
 ;
-define { <vscale x 4 x float>, <vscale x 4 x float> } @multi_vector_cvt_x2_s32_f32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
-; CHECK-LABEL: multi_vector_cvt_x2_s32_f32:
+define { <vscale x 4 x float>, <vscale x 4 x float> } @multi_vector_cvt_x2_f32_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_f32_s32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    scvtf { z0.s, z1.s }, { z2.s, z3.s }
 ; CHECK-NEXT:    ret
-  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
-  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.scvtf.x2.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)

david-arm wrote: Shouldn't the intrinsic name be `@llvm.aarch64.sve.scvtf.x2.nxv4f32`, because the intrinsics are all keyed off the floating point type, with bitcasts of the variable FP type to an integer type? I realise this does seem to work, but perhaps it's clearer to use the correct type.

https://github.com/llvm/llvm-project/pull/77656
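To make the naming convention in the comment concrete, a minimal IR sketch, assuming (as the review argues) that the overload is mangled by the floating-point type even when the source operands are integers:

  ; Sketch only: the suffix names the FP result type, not the integer operands.
  declare { <vscale x 4 x float>, <vscale x 4 x float> }
    @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x i32>, <vscale x 4 x i32>)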
[clang] ceb6c23 - [NFC][LoopVectorize] Explicitly disable tail-folding on some SVE tests
Author: David Sherwood Date: 2022-07-21T15:23:00+01:00 New Revision: ceb6c23b708d4cae3fbb0a569c5ac14069524a63 URL: https://github.com/llvm/llvm-project/commit/ceb6c23b708d4cae3fbb0a569c5ac14069524a63 DIFF: https://github.com/llvm/llvm-project/commit/ceb6c23b708d4cae3fbb0a569c5ac14069524a63.diff LOG: [NFC][LoopVectorize] Explicitly disable tail-folding on some SVE tests This patch is in preparation for enabling vectorisation with tail-folding by default for SVE targets. Once we do that many existing tests will break that depend upon having normal unpredicated vector loops. For all such tests I have added the flag: -prefer-predicate-over-epilogue=scalar-epilogue Differential Revision: https://reviews.llvm.org/D129137 Added: Modified: clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll Removed: diff --git a/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c b/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c index bccd328f0ccad..e306f44c27fb3 100644 --- a/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c +++ b/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c @@ -1,7 +1,11 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S -o - %s -mvscale-min=2 -mvscale-max=2 | FileCheck %s --check-prefixes=CHECK,CHECK256 -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S -o - %s -mvscale-min=4 -mvscale-max=4 | FileCheck %s --check-prefixes=CHECK,CHECK512 -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S -o - %s -mvscale-min=8 
-mvscale-max=8 | FileCheck %s --check-prefixes=CHECK,CHECK1024 -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S -o - %s -mvscale-min=16 -mvscale-max=16 | FileCheck %s --check-prefixes=CHECK,CHECK2048 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S \ +// RUN: -mllvm -prefer-predicate-over-epilogue=scalar-epilogue -o - %s -mvscale-min=2 -mvscale-max=2 | FileCheck %s --check-prefixes=CHECK,CHECK256 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S \ +// RUN: -mllvm -prefer-predicate-over-epilogue=scalar-epilogue -o - %s -mvscale-min=4 -mvscale-max=4 | FileCheck %s --check-prefixes=CHECK,CHECK512 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S \ +// RUN: -mllvm -prefer-predicate-over-epilogue=scalar-epilogue -o - %s -mvscale-min=8 -mvscale-max=8 | FileCheck %s --check-prefixes=CHECK,CHECK1024 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S \ +// RUN: -m
[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
david-arm wrote: > Hi! I wonder whether you have conducted any tests to determine the potential performance increase of this pass in the SPEC2017 557.xz benchmark? I attempted to apply it to the xz benchmark, but only one copy (--copies=1) demonstrated a significant increase (about 3%); there was no increase when I set --copies=128 or higher. Do you have any suggestions or test results that you could share?

The most significant gains with xz were already achieved when https://github.com/llvm/llvm-project/pull/77480 landed, which improved performance by 6-7% for neoverse-v1. This PR is an NFC refactoring patch, so it won't improve performance further. My follow-on patch (not yet posted) will trigger more cases in xz, but I don't expect any substantial performance gains for xz. The main goal of extending this pass further is to improve code coverage and testing, and hopefully there will be other applications besides xz that will benefit too.

https://github.com/llvm/llvm-project/pull/72273
[clang] [LTO] Fix Veclib flags correctly pass to LTO flags (PR #78749)
https://github.com/david-arm approved this pull request. LGTM as well! https://github.com/llvm/llvm-project/pull/78749
[llvm] [clang-tools-extra] [clang] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)
david-arm wrote: Gentle ping! https://github.com/llvm/llvm-project/pull/76034
[llvm] [clang-tools-extra] [clang] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)
@@ -2076,16 +2081,61 @@ class GeneratedRTChecks {
       LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
       RTCheckCost += C;
     }
-    if (MemCheckBlock)
+    if (MemCheckBlock) {
+      InstructionCost MemCheckCost = 0;
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
-        RTCheckCost += C;
+        MemCheckCost += C;
       }

+      // If the runtime memory checks are being created inside an outer loop
+      // we should find out if these checks are outer loop invariant. If so,
+      // the checks will likely be hoisted out and so the effective cost will
+      // reduce according to the outer loop trip count.
+      if (OuterLoop) {
+        ScalarEvolution *SE = MemCheckExp.getSE();
+        // TODO: We could refine this further by analysing every individual
+        // memory check, since there could be a mixture of loop variant and
+        // invariant checks that mean the final condition is variant. However,
+        // I think it would need further analysis to prove this is beneficial.
+        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+        if (SE->isLoopInvariant(Cond, OuterLoop)) {
+          // It seems reasonable to assume that we can reduce the effective
+          // cost of the checks even when we know nothing about the trip
+          // count. Here I've assumed that the outer loop executes at least
+          // twice.
+          unsigned BestTripCount = 2;
+
+          // If exact trip count is known use that.
+          if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
+            BestTripCount = SmallTC;
+          else if (LoopVectorizeWithBlockFrequency) {
+            // Else use profile data if available.
+            if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
+              BestTripCount = *EstimatedTC;
+          }
+
+          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
+
+          // Let's ensure the cost is always at least 1.
+          NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), (long)1);

david-arm wrote: Good spot! I hope I've fixed it now. :)

https://github.com/llvm/llvm-project/pull/76034
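To make the arithmetic concrete, here is a standalone sketch of the scaling logic above, with InstructionCost reduced to a plain integer. The names mirror the diff, but this is an illustration, not the in-tree code:

  #include <algorithm>
  #include <cstdint>

  // Effective cost of hoistable memory checks inside an outer loop:
  // divide by the assumed outer trip count, but never drop below 1.
  int64_t effectiveMemCheckCost(int64_t MemCheckCost, unsigned BestTripCount) {
    return std::max<int64_t>(MemCheckCost / BestTripCount, 1);
  }

  // e.g. effectiveMemCheckCost(6, 2) == 3, effectiveMemCheckCost(6, 64) == 1,
  // so loop-invariant checks become almost free for large outer trip counts.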
[clang] [LTO] Fix Veclib flags correctly pass to LTO flags (PR #78749)
@@ -31,3 +31,31 @@
 // RUN: %clang -fveclib=Accelerate %s -nodefaultlibs -target arm64-apple-ios8.0.0 -### 2>&1 | FileCheck --check-prefix=CHECK-LINK-NODEFAULTLIBS %s
 // CHECK-LINK-NODEFAULTLIBS-NOT: "-framework" "Accelerate"
+
+/* Verify that the correct vector library is passed to LTO flags. */
+
+// RUN: %clang -### -fveclib=none -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-NOLIB %s
+// CHECK-LTO-NOLIB: "-plugin-opt=-vector-library=none"
+
+// RUN: %clang -### -fveclib=Accelerate -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-ACCELERATE %s
+// CHECK-LTO-ACCELERATE: "-plugin-opt=-vector-library=Accelerate"
+
+// RUN: %clang -### -fveclib=LIBMVEC -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-LIBMVEC %s
+// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC-X86"
+
+// RUN: %clang -### -fveclib=MASSV -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-MASSV %s
+// CHECK-LTO-MASSV: "-plugin-opt=-vector-library=MASSV"
+
+// RUN: not %clang -### -fveclib=SVML -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-SVML %s
+// CHECK-LTO-SVML: "-plugin-opt=-vector-library=SVML"
+
+// RUN: %clang -### -fveclib=SLEEF -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-SLEEF %s
+// CHECK-LTO-SLEEF: "-plugin-opt=-vector-library=sleefgnuabi"
+
+// RUN: %clang -### -fveclib=Darwin_libsystem_m -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-DARWIN %s
+// CHECK-LTO-DARWIN: "-plugin-opt=-vector-library=Darwin_libsystem_m"
+
+// RUN: %clang -### -fveclib=ArmPL -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-ARMPL %s

david-arm wrote:

Looks like `--target=aarch64-none-none` is needed for SLEEF and ArmPL perhaps? In the first 8 RUN lines it looks like we don't specify the target except for those cases.

https://github.com/llvm/llvm-project/pull/78749
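If the suggestion is taken, the two AArch64-only RUN lines might gain an explicit triple along these lines (a sketch only; the exact triple is an assumption and the PR may settle on a different one):

```c
// Hypothetical RUN lines with an explicit AArch64 triple, per the comment above.
// RUN: %clang -### --target=aarch64-none-none -fveclib=SLEEF -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-SLEEF %s
// RUN: %clang -### --target=aarch64-none-none -fveclib=ArmPL -flto %s -v 2>&1 | FileCheck -check-prefix CHECK-LTO-ARMPL %s
```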
[clang] [LTO] Fix Veclib flags correctly pass to LTO flags (PR #78749)
@@ -783,6 +783,28 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
                                          "-generate-arange-section"));
   }
 
+  // Pass vector library arguments to LTO.
+  Arg *ArgVecLib = Args.getLastArg(options::OPT_fveclib);
+  if (ArgVecLib && ArgVecLib->getNumValues() == 1) {
+    // Map the vector library names from clang front-end to opt front-end. The
+    // values are taken from the TargetLibraryInfo class command line options.
+    std::optional<StringRef> OptVal =
+        llvm::StringSwitch<std::optional<StringRef>>(ArgVecLib->getValue())

david-arm wrote:

Is it possible to refactor and reuse existing TargetLibraryInfo code, i.e. create a common static function that maps the values so that it can be called in multiple places?

https://github.com/llvm/llvm-project/pull/78749
[clang] [LTO] Fix Veclib flags correctly pass to LTO flags (PR #78749)
@@ -783,6 +783,28 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
                                          "-generate-arange-section"));
   }
 
+  // Pass vector library arguments to LTO.
+  Arg *ArgVecLib = Args.getLastArg(options::OPT_fveclib);
+  if (ArgVecLib && ArgVecLib->getNumValues() == 1) {
+    // Map the vector library names from clang front-end to opt front-end. The
+    // values are taken from the TargetLibraryInfo class command line options.
+    std::optional<StringRef> OptVal =
+        llvm::StringSwitch<std::optional<StringRef>>(ArgVecLib->getValue())

david-arm wrote:

Yes I think that would work, i.e. having a static function in TargetLibraryInfo.h that can be called in two places and doesn't have a dependency on the component/library. Having said that, I won't hold this patch up for this if it's too difficult!

https://github.com/llvm/llvm-project/pull/78749
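For concreteness, the shared helper being discussed might look roughly like this. Everything here is a sketch: the function name, its placement, and the `Default` handling are assumptions; only the name mappings are taken from the test updates quoted earlier in the thread.

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include <optional>

// Hypothetical shared helper: map a clang -fveclib= value to the name that
// opt/LTO's -vector-library option expects. Returns std::nullopt for
// unrecognised names so callers can diagnose them.
inline std::optional<llvm::StringRef>
mapVecLibToBackendName(llvm::StringRef Name) {
  return llvm::StringSwitch<std::optional<llvm::StringRef>>(Name)
      .Case("Accelerate", "Accelerate")
      .Case("LIBMVEC", "LIBMVEC-X86")
      .Case("MASSV", "MASSV")
      .Case("SVML", "SVML")
      .Case("SLEEF", "sleefgnuabi")
      .Case("Darwin_libsystem_m", "Darwin_libsystem_m")
      .Case("ArmPL", "ArmPL")
      .Case("none", "none")
      .Default(std::nullopt);
}
```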
[clang] [CXXNameMangler] Correct the mangling of SVE ACLE types within function names. (PR #69460)
https://github.com/david-arm approved this pull request.

LGTM! An outstanding work of art @paulwalker-arm!

https://github.com/llvm/llvm-project/pull/69460
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
https://github.com/david-arm edited https://github.com/llvm/llvm-project/pull/69725
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
@@ -9893,24 +9888,37 @@ Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) {
   return Call;
 }
 
-Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
-                                                  const CallExpr *E) {
+void CodeGenFunction::GetAArch64SMEProcessedOperands(

david-arm wrote:

I wonder if actually this is better named as GetAArch64SVEProcessedOperands, because if we have to choose a name that's common to both the SME and SVE builtins, choosing SVE might make more sense. That's because we're specifically dealing with scalable vectors in general here and not something that's intrinsically linked to SME.

https://github.com/llvm/llvm-project/pull/69725
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
@@ -1016,29 +1021,24 @@ std::string Intrinsic::mangleName(ClassKind LocalCK) const {
          getMergeSuffix();
 }
 
-void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const {
+void Intrinsic::emitIntrinsic(raw_ostream &OS, ACLEKind Kind) const {
   bool IsOverloaded = getClassKind() == ClassG && getProto().size() > 1;
 
   std::string FullName = mangleName(ClassS);
   std::string ProtoName = mangleName(getClassKind());
   std::string SMEAttrs = "";
 
-  if (Flags & Emitter.getEnumValueForFlag("IsStreaming"))
-    SMEAttrs += ", arm_streaming";
-  if (Flags & Emitter.getEnumValueForFlag("IsStreamingCompatible"))
-    SMEAttrs += ", arm_streaming_compatible";
-  if (Flags & Emitter.getEnumValueForFlag("IsSharedZA"))
-    SMEAttrs += ", arm_shared_za";
-  if (Flags & Emitter.getEnumValueForFlag("IsPreservesZA"))
-    SMEAttrs += ", arm_preserves_za";
-
-  OS << (IsOverloaded ? "__aio " : "__ai ")
-     << "__attribute__((__clang_arm_builtin_alias("
-     << (SMEAttrs.empty() ? "__builtin_sve_" : "__builtin_sme_")
-     << FullName << ")";
-  if (!SMEAttrs.empty())
-    OS << SMEAttrs;

david-arm wrote:

It looks like we're no longer printing out the attributes for the builtin - is this because the attributes are dealt with explicitly elsewhere in clang and so they are no longer needed?

https://github.com/llvm/llvm-project/pull/69725
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
https://github.com/david-arm commented:

I've not done an exhaustive review, but thought I'd leave the comments I have so far!

https://github.com/llvm/llvm-project/pull/69725
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
@@ -10272,29 +10291,13 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);

david-arm wrote:

Do we still need this code given we're now checking the ICE arguments in GetAArch64SMEProcessedOperands?

https://github.com/llvm/llvm-project/pull/69725
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
@@ -9893,24 +9888,37 @@ Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) {
   return Call;
 }
 
-Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
-                                                  const CallExpr *E) {
+void CodeGenFunction::GetAArch64SMEProcessedOperands(
+    unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
+    SVETypeFlags TypeFlags) {
   // Find out if any arguments are required to be integer constant expressions.
   unsigned ICEArguments = 0;
   ASTContext::GetBuiltinTypeError Error;
   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
   assert(Error == ASTContext::GE_None && "Should not codegen an error");
 
-  llvm::Type *Ty = ConvertType(E->getType());
-  if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
-      BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
-    Value *Val = EmitScalarExpr(E->getArg(0));
-    return EmitSVEReinterpret(Val, Ty);
-  }
+  bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
 
-  llvm::SmallVector<Value *, 4> Ops;
   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-    if ((ICEArguments & (1 << i)) == 0)
+    if (!IsTupleGetOrSet && (ICEArguments & (1 << i)) == 0) {

david-arm wrote:

Perhaps you can create a temp variable and reuse it so it's a bit clearer, i.e.

```
bool IsICE = ICEArguments & (1 << i);
if (!IsTupleGetOrSet && !IsICE) {
  ...
} else if (!IsICE) {
  ...
```

https://github.com/llvm/llvm-project/pull/69725
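Filled out as a standalone toy (the printf placeholders stand in for the real codegen paths, which are elided in the suggestion above), the proposed shape would read:

```cpp
#include <cstdio>

int main() {
  unsigned ICEArguments = 0b0101; // args 0 and 2 must be integer constant exprs
  bool IsTupleGetOrSet = false;
  for (unsigned i = 0, e = 4; i != e; i++) {
    // The temp variable suggested above: one readable name for the bit test.
    bool IsICE = ICEArguments & (1u << i);
    if (!IsTupleGetOrSet && !IsICE)
      std::printf("arg %u: emit as a normal expression\n", i);
    else if (!IsICE)
      std::printf("arg %u: tuple get/set handling\n", i);
    else
      std::printf("arg %u: emit as an immediate constant\n", i);
  }
}
```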
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
@@ -9893,24 +9888,37 @@ Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) {
   return Call;
 }
 
-Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
-                                                  const CallExpr *E) {
+void CodeGenFunction::GetAArch64SMEProcessedOperands(
+    unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
+    SVETypeFlags TypeFlags) {
   // Find out if any arguments are required to be integer constant expressions.
   unsigned ICEArguments = 0;
   ASTContext::GetBuiltinTypeError Error;
   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
   assert(Error == ASTContext::GE_None && "Should not codegen an error");
 
-  llvm::Type *Ty = ConvertType(E->getType());
-  if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
-      BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
-    Value *Val = EmitScalarExpr(E->getArg(0));
-    return EmitSVEReinterpret(Val, Ty);
-  }
+  bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
 
-  llvm::SmallVector<Value *, 4> Ops;
   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-    if ((ICEArguments & (1 << i)) == 0

david-arm wrote:

Might be worth adding a comment explaining why we explicitly ignore tuple get/set functions?

https://github.com/llvm/llvm-project/pull/69725
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
https://github.com/david-arm edited https://github.com/llvm/llvm-project/pull/69725
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
https://github.com/david-arm commented:

This looks a lot better now @kmclaughlin-arm - thanks for the changes! I just have a couple of comments about the tests that I missed previously...

https://github.com/llvm/llvm-project/pull/69725
[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)
@@ -0,0 +1,418 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Single-Multi
+//
+
+// x2
+// CHECK-LABEL: @test_svsub_write_single2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[ADD]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u32j12svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[ADD]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) {
+  SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base + 7, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[ADD]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u64j12svuint64x2_tu12__SVUint64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[ADD]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) {
+  SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base + 7, zn, zm);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svsub_write_single4_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[ADD]], <vscale x 4 x i32> [[TMP0]],