[clang] 9a1a7d8 - [SVE] Add more warnings checks to clang and LLVM SVE tests

2020-07-07 Thread David Sherwood via cfe-commits

Author: David Sherwood
Date: 2020-07-07T09:33:20+01:00
New Revision: 9a1a7d888b53ebe5a934a8193de37da86e276f1e

URL: 
https://github.com/llvm/llvm-project/commit/9a1a7d888b53ebe5a934a8193de37da86e276f1e
DIFF: 
https://github.com/llvm/llvm-project/commit/9a1a7d888b53ebe5a934a8193de37da86e276f1e.diff

LOG: [SVE] Add more warnings checks to clang and LLVM SVE tests

There are now more SVE tests in LLVM and Clang that do not
emit warnings related to invalid use of EVT::getVectorNumElements()
and VectorType::getNumElements(). For these tests I have added
additional checks that there are no warnings in order to prevent
any future regressions.

Differential Revision: https://reviews.llvm.org/D82943

Added: 


Modified: 
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acge.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acgt.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acle.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_aclt.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpeq.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpge.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpgt.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmple.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmplt.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpne.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpuo.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_index.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sb.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sh.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sw.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1ub.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uh.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uw.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sb.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sh.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sw.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1ub.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uh.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uw.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pnext.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptrue.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_setffr.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpkhi.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpklo.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilele.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilelt.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2.c
llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll
llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
llvm/test/CodeGen/AArch64/sve-fcmp.ll
llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
llvm/test/CodeGen/AArch64/sve-gep.ll
llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-scaled-offsets.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-unscaled-offsets.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-scaled-offset.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-unscaled-offset.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-imm-offset.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-scalar-offset.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll

llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-vector-base-imm-offset.ll

llvm/test/CodeGen/AArch64/sve

[clang] ae47d15 - Remove "rm -f" workaround in acle_sve_adda.c

2020-06-26 Thread David Sherwood via cfe-commits

Author: David Sherwood
Date: 2020-06-26T08:16:40+01:00
New Revision: ae47d158a096abad43d8f9056518d83b66c5a4b7

URL: 
https://github.com/llvm/llvm-project/commit/ae47d158a096abad43d8f9056518d83b66c5a4b7
DIFF: 
https://github.com/llvm/llvm-project/commit/ae47d158a096abad43d8f9056518d83b66c5a4b7.diff

LOG: Remove "rm -f" workaround in acle_sve_adda.c

Added: 


Modified: 
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c

Removed: 




diff  --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c 
b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
index 9d9c33a891cd..853da8783faa 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
@@ -1,5 +1,4 @@
 // REQUIRES: aarch64-registered-target
-// RUN: rm -f -- %S/acle_sve_adda.s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | 
FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall 
-emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] c02332a - [CodeGen] Fix warning in getNode for EXTRACT_SUBVECTOR

2020-06-30 Thread David Sherwood via cfe-commits

Author: David Sherwood
Date: 2020-06-30T08:11:41+01:00
New Revision: c02332a69399a82244298f0097bc98fafdeb3042

URL: 
https://github.com/llvm/llvm-project/commit/c02332a69399a82244298f0097bc98fafdeb3042
DIFF: 
https://github.com/llvm/llvm-project/commit/c02332a69399a82244298f0097bc98fafdeb3042.diff

LOG: [CodeGen] Fix warning in getNode for EXTRACT_SUBVECTOR

Fix a warning in getNode() when extracting a subvector from a
concat vector. We can simply replace the call to getVectorNumElements
with getVectorMinNumElements as this follows the defined behaviour
for EXTRACT_SUBVECTOR.

Differential Revision: https://reviews.llvm.org/D82746

Added: 


Modified: 
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Removed: 




diff  --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c 
b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
index 788bad9022b5..7beb191cab30 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
@@ -1,6 +1,11 @@
+// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | 
FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall 
-emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
 
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README 
for instructions on how to resolve it.
+// ASM-NOT: warning
#include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS

diff  --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c 
b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
index 502f22d84210..63e17c3e1e0f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
@@ -1,6 +1,11 @@
+// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | 
FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall 
-emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
 
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README 
for instructions on how to resolve it.
+// ASM-NOT: warning
#include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS

diff  --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c 
b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
index 399fa187e83a..a34f41ff3b40 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
@@ -1,6 +1,11 @@
+// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | 
FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall 
-emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
 
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README 
for instructions on how to resolve it.
+// ASM-NOT: warning
#include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS

diff  --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c 
b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
index 7170756d7a98..de21c59bb3b7 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
@@ -1,6 +1,11 @@
+// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o

[clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-12-05 Thread David Sherwood via cfe-commits

david-arm wrote:

Hi @AaronBallman, yes the problem I found with always choosing `char` as the 
alias type is that LLVM will just assume that enum types alias with absolutely 
everything. This is a conservative approach that works fine, but it does 
prevent important type-based alias optimisations from happening. GCC seems to 
take advantage of the fact that enums without any explicit type should be 
compatible with an `int` (or a suitable integer that the compiler is free to 
choose that can contain all enumerated values). Consequently it can be more 
aggressive in optimisations, which is why I started looking at the C 
specification to understand the rules. If my understanding is correct, I think 
the important thing is that the type-based alias information should match the 
actual underlying type we've chosen for the enum. It could be, for example that 
in the case of an enum like this:

```
enum Foo {
  A = 1,
  B = 0x1ull
};
```

gcc chooses a `long long` and clang chooses a `long` since there seems to be 
wiggle room in the specification. I think that's fine provided the compiler is 
self-consistent. In either case both `long` and `long long` would not alias 
with `int` or `short`, which is still an improvement on treating it as a `char` 
in TBAA info.

https://github.com/llvm/llvm-project/pull/73326
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)

2023-12-07 Thread David Sherwood via cfe-commits

david-arm wrote:

Gentle ping!

https://github.com/llvm/llvm-project/pull/73515
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-12-07 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/73326

>From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Fri, 24 Nov 2023 13:20:23 +
Subject: [PATCH 1/3] [Clang] Emit TBAA info for enums in C

When emitting TBAA information for enums in C code we
currently just treat the data as an 'omnipotent char'.
However, with C strict aliasing this means we fail to
optimise certain cases. For example, in the SPEC2017
xz benchmark there are structs that contain arrays of
enums, and clang pessimistically assumes that accesses
to those enums could alias with other struct members
that have a different type.

According to

https://en.cppreference.com/w/c/language/enum

enums should be treated as 'int' types unless
explicitly specified (C23) or if 'int' would not be
large enough to hold all the enumerated values. In the
latter case the compiler is free to choose a suitable
integer that would hold all such values.

When compiling C code this patch generates TBAA
information for the enum by using an equivalent integer
of the size clang has already chosen for the enum. I
have ignored C++ for now because the rules are more
complex.

New test added here:

  clang/test/CodeGen/tbaa.c
---
 clang/lib/CodeGen/CodeGenTBAA.cpp |   5 +-
 clang/test/CodeGen/tbaa.c | 116 ++
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/tbaa.c

diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp 
b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 8705d3d65f1a5..f59d3d422d520 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type 
*Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
+if (!Features.CPlusPlus)
+  return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));
+
 // In C++ mode, types have linkage, so we can rely on the ODR and
 // on their mangled names, if they're external.
 // TODO: Is there a way to get a program-wide unique name for a
 // decl with local linkage or no linkage?
-if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible())
+if (!ETy->getDecl()->isExternallyVisible())
   return getChar();
 
 SmallString<256> OutName;
diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c
new file mode 100644
index 0..0ab81f60a7194
--- /dev/null
+++ b/clang/test/CodeGen/tbaa.c
@@ -0,0 +1,116 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefixes=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// Test TBAA metadata generated by front-end.
+//
+// NO-TBAA-NOT: !tbaa
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef enum {
+  RED_AUTO_32,
+  GREEN_AUTO_32,
+  BLUE_AUTO_32
+} EnumAuto32;
+
+typedef enum {
+  RED_AUTO_64,
+  GREEN_AUTO_64,
+  BLUE_AUTO_64 = 0x1ull
+} EnumAuto64;
+
+typedef enum : uint16_t {
+  RED_16,
+  GREEN_16,
+  BLUE_16
+} Enum16;
+
+typedef enum : uint8_t {
+  RED_8,
+  GREEN_8,
+  BLUE_8
+} Enum8;
+
+uint32_t g0(EnumAuto32 *E, uint32_t *val) {
+// CHECK-LABEL: define{{.*}} i32 @g0(
+// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH-LABEL: define{{.*}} i32 @g0(
+// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+  *val = 5;
+  *E = RED_AUTO_32;
+  return *val;
+}
+
+uint64_t g1(EnumAuto64 *E, uint64_t *val) {
+// CHECK-LABEL: define{{.*}} i64 @g1(
+// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+// PATH-LABEL: define{{.*}} i64 @g1(
+// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+  *val = 5;
+  *E = RED_AUTO_64;
+  return *val;
+}
+
+uint16_t g2(Enum16 *E, uint16_t *val) {
+// CHECK-LABEL: define{{.*}} i16 @g2(
+// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.

[llvm] [clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-12-07 Thread David Sherwood via cfe-commits

david-arm wrote:

> Do you think it's worth adding something to the Clang release note?

Done. Hope the documentation I added makes sense!

https://github.com/llvm/llvm-project/pull/73326
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-12-07 Thread David Sherwood via cfe-commits

david-arm wrote:

> I thought the suggestion was to add a few lines to 
> https://github.com/llvm/llvm-project/blob/main/clang/docs/ReleaseNotes.rst

Yes you're right! For some reason I got mixed up with the LangRef, but I guess 
adding something to the LangRef does no harm either. I'll put something in the 
release notes too. :)

https://github.com/llvm/llvm-project/pull/73326
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-12-07 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/73326

>From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Fri, 24 Nov 2023 13:20:23 +
Subject: [PATCH 1/4] [Clang] Emit TBAA info for enums in C

When emitting TBAA information for enums in C code we
currently just treat the data as an 'omnipotent char'.
However, with C strict aliasing this means we fail to
optimise certain cases. For example, in the SPEC2017
xz benchmark there are structs that contain arrays of
enums, and clang pessimistically assumes that accesses
to those enums could alias with other struct members
that have a different type.

According to

https://en.cppreference.com/w/c/language/enum

enums should be treated as 'int' types unless
explicitly specified (C23) or if 'int' would not be
large enough to hold all the enumerated values. In the
latter case the compiler is free to choose a suitable
integer that would hold all such values.

When compiling C code this patch generates TBAA
information for the enum by using an equivalent integer
of the size clang has already chosen for the enum. I
have ignored C++ for now because the rules are more
complex.

New test added here:

  clang/test/CodeGen/tbaa.c
---
 clang/lib/CodeGen/CodeGenTBAA.cpp |   5 +-
 clang/test/CodeGen/tbaa.c | 116 ++
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/tbaa.c

diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp 
b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 8705d3d65f1a5..f59d3d422d520 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type 
*Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
+if (!Features.CPlusPlus)
+  return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));
+
 // In C++ mode, types have linkage, so we can rely on the ODR and
 // on their mangled names, if they're external.
 // TODO: Is there a way to get a program-wide unique name for a
 // decl with local linkage or no linkage?
-if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible())
+if (!ETy->getDecl()->isExternallyVisible())
   return getChar();
 
 SmallString<256> OutName;
diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c
new file mode 100644
index 0..0ab81f60a7194
--- /dev/null
+++ b/clang/test/CodeGen/tbaa.c
@@ -0,0 +1,116 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefixes=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// Test TBAA metadata generated by front-end.
+//
+// NO-TBAA-NOT: !tbaa
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef enum {
+  RED_AUTO_32,
+  GREEN_AUTO_32,
+  BLUE_AUTO_32
+} EnumAuto32;
+
+typedef enum {
+  RED_AUTO_64,
+  GREEN_AUTO_64,
+  BLUE_AUTO_64 = 0x1ull
+} EnumAuto64;
+
+typedef enum : uint16_t {
+  RED_16,
+  GREEN_16,
+  BLUE_16
+} Enum16;
+
+typedef enum : uint8_t {
+  RED_8,
+  GREEN_8,
+  BLUE_8
+} Enum8;
+
+uint32_t g0(EnumAuto32 *E, uint32_t *val) {
+// CHECK-LABEL: define{{.*}} i32 @g0(
+// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH-LABEL: define{{.*}} i32 @g0(
+// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+  *val = 5;
+  *E = RED_AUTO_32;
+  return *val;
+}
+
+uint64_t g1(EnumAuto64 *E, uint64_t *val) {
+// CHECK-LABEL: define{{.*}} i64 @g1(
+// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+// PATH-LABEL: define{{.*}} i64 @g1(
+// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+  *val = 5;
+  *E = RED_AUTO_64;
+  return *val;
+}
+
+uint16_t g2(Enum16 *E, uint16_t *val) {
+// CHECK-LABEL: define{{.*}} i16 @g2(
+// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.

[clang] [llvm] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-12-07 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/73326

>From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Fri, 24 Nov 2023 13:20:23 +
Subject: [PATCH 1/5] [Clang] Emit TBAA info for enums in C

When emitting TBAA information for enums in C code we
currently just treat the data as an 'omnipotent char'.
However, with C strict aliasing this means we fail to
optimise certain cases. For example, in the SPEC2017
xz benchmark there are structs that contain arrays of
enums, and clang pessimistically assumes that accesses
to those enums could alias with other struct members
that have a different type.

According to

https://en.cppreference.com/w/c/language/enum

enums should be treated as 'int' types unless
explicitly specified (C23) or if 'int' would not be
large enough to hold all the enumerated values. In the
latter case the compiler is free to choose a suitable
integer that would hold all such values.

When compiling C code this patch generates TBAA
information for the enum by using an equivalent integer
of the size clang has already chosen for the enum. I
have ignored C++ for now because the rules are more
complex.

New test added here:

  clang/test/CodeGen/tbaa.c
---
 clang/lib/CodeGen/CodeGenTBAA.cpp |   5 +-
 clang/test/CodeGen/tbaa.c | 116 ++
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/tbaa.c

diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp 
b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 8705d3d65f1a5..f59d3d422d520 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type 
*Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
+if (!Features.CPlusPlus)
+  return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));
+
 // In C++ mode, types have linkage, so we can rely on the ODR and
 // on their mangled names, if they're external.
 // TODO: Is there a way to get a program-wide unique name for a
 // decl with local linkage or no linkage?
-if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible())
+if (!ETy->getDecl()->isExternallyVisible())
   return getChar();
 
 SmallString<256> OutName;
diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c
new file mode 100644
index 0..0ab81f60a7194
--- /dev/null
+++ b/clang/test/CodeGen/tbaa.c
@@ -0,0 +1,116 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefixes=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// Test TBAA metadata generated by front-end.
+//
+// NO-TBAA-NOT: !tbaa
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef enum {
+  RED_AUTO_32,
+  GREEN_AUTO_32,
+  BLUE_AUTO_32
+} EnumAuto32;
+
+typedef enum {
+  RED_AUTO_64,
+  GREEN_AUTO_64,
+  BLUE_AUTO_64 = 0x1ull
+} EnumAuto64;
+
+typedef enum : uint16_t {
+  RED_16,
+  GREEN_16,
+  BLUE_16
+} Enum16;
+
+typedef enum : uint8_t {
+  RED_8,
+  GREEN_8,
+  BLUE_8
+} Enum8;
+
+uint32_t g0(EnumAuto32 *E, uint32_t *val) {
+// CHECK-LABEL: define{{.*}} i32 @g0(
+// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH-LABEL: define{{.*}} i32 @g0(
+// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+  *val = 5;
+  *E = RED_AUTO_32;
+  return *val;
+}
+
+uint64_t g1(EnumAuto64 *E, uint64_t *val) {
+// CHECK-LABEL: define{{.*}} i64 @g1(
+// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+// PATH-LABEL: define{{.*}} i64 @g1(
+// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+  *val = 5;
+  *E = RED_AUTO_64;
+  return *val;
+}
+
+uint16_t g2(Enum16 *E, uint16_t *val) {
+// CHECK-LABEL: define{{.*}} i16 @g2(
+// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.

[llvm] [clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-12-07 Thread David Sherwood via cfe-commits


@@ -196,6 +196,9 @@ C Language Changes
   number of elements in the flexible array member. This information can improve
   the results of the array bound sanitizer and the
   ``__builtin_dynamic_object_size`` builtin.
+- Enums will now be represented in TBAA metadata using their actual underlying

david-arm wrote:

The comment is actually under the heading `C Language Changes` so I think that 
should be clear enough. Is that ok?

https://github.com/llvm/llvm-project/pull/73326
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-12-08 Thread David Sherwood via cfe-commits

https://github.com/david-arm closed 
https://github.com/llvm/llvm-project/pull/73326
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)

2023-12-08 Thread David Sherwood via cfe-commits


@@ -346,7 +346,9 @@ void RuntimePointerChecking::tryToCreateDiffCheck(
 auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
 const Loop *StartARLoop = SrcStartAR->getLoop();
 if (StartARLoop == SinkStartAR->getLoop() &&
-StartARLoop == InnerLoop->getParentLoop()) {
+StartARLoop == InnerLoop->getParentLoop() &&
+SrcStartAR->getStepRecurrence(*SE) !=

david-arm wrote:

Hi @fhahn, sorry I only just saw this now! I find with github it's really easy 
to miss review comments compared to Phabricator...

https://github.com/llvm/llvm-project/pull/73515
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)

2023-12-08 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/73515

>From 30251642f8c208c63f3f3097c337ef0d5bc633b5 Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Mon, 27 Nov 2023 13:43:26 +
Subject: [PATCH 1/4] [LoopVectorize] Improve algorithm for hoisting runtime
 checks

When attempting to hoist runtime checks out of a loop we currently
avoid creating pointer diff checks and prefer to do expanded range
checks instead. This gives us the opportunity to hoist runtime
checks out of a loop, since these checks are loop invariant. However,
in some cases the pointer diff checks would also be loop invariant
and so will naturally get hoisted. Therefore, since diff checks are
cheaper, we should prefer to use those instead.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp  |   5 +-
 .../LoopVectorize/runtime-checks-hoist.ll | 143 ++
 2 files changed, 121 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp 
b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 3d1edd5f038a25..05765223397987 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -346,7 +346,10 @@ void RuntimePointerChecking::tryToCreateDiffCheck(
 auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
 const Loop *StartARLoop = SrcStartAR->getLoop();
 if (StartARLoop == SinkStartAR->getLoop() &&
-StartARLoop == InnerLoop->getParentLoop()) {
+StartARLoop == InnerLoop->getParentLoop() &&
+!SE->isKnownPredicate(ICmpInst::ICMP_EQ,
+  SrcStartAR->getStepRecurrence(*SE),
+  SinkStartAR->getStepRecurrence(*SE))) {
   LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
"cannot be hoisted out of the outer loop\n");
   CanUseDiffCheck = false;
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll 
b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
index 891597cbdc48a8..81702bf34e96be 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
@@ -69,11 +69,11 @@ define void @diff_checks(ptr nocapture noundef writeonly 
%dst, ptr nocapture nou
 ; CHECK-NEXT:[[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], [[TMP10]]
 ; CHECK-NEXT:[[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 
[[TMP14]]
 ; CHECK-NEXT:[[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], 
i32 0
-; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, 
!alias.scope !0
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, 
!alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:[[TMP17:%.*]] = add nsw i64 [[TMP13]], [[TMP11]]
 ; CHECK-NEXT:[[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 
[[TMP17]]
 ; CHECK-NEXT:[[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], 
i32 0
-; CHECK-NEXT:store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, 
!alias.scope !3, !noalias !0
+; CHECK-NEXT:store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, 
!alias.scope [[META3:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:[[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -189,12 +189,12 @@ define void @full_checks(ptr nocapture noundef %dst, ptr 
nocapture noundef reado
 ; CHECK-NEXT:[[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 
[[TMP5]]
 ; CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, 
!alias.scope !9
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, 
!alias.scope [[META9:![0-9]+]]
 ; CHECK-NEXT:[[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 
[[TMP5]]
 ; CHECK-NEXT:[[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT:[[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, 
!alias.scope !12, !noalias !9
+; CHECK-NEXT:[[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, 
!alias.scope [[META12:![0-9]+]], !noalias [[META9]]
 ; CHECK-NEXT:[[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], 
[[WIDE_LOAD]]
-; CHECK-NEXT:store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, 
!alias.scope !12, !noalias !9
+; CHECK-NEXT:store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, 
!alias.scope [[META12]], !noalias [[META9]]
 ; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:[[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -319,13 +319,13 @@ define void @full_ch

[clang-tools-extra] [clang] [llvm] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)

2023-12-11 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/73515

>From 30251642f8c208c63f3f3097c337ef0d5bc633b5 Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Mon, 27 Nov 2023 13:43:26 +
Subject: [PATCH 1/5] [LoopVectorize] Improve algorithm for hoisting runtime
 checks

When attempting to hoist runtime checks out of a loop we currently
avoid creating pointer diff checks and prefer to do expanded range
checks instead. This gives us the opportunity to hoist runtime
checks out of a loop, since these checks are loop invariant. However,
in some cases the pointer diff checks would also be loop invariant
and so will naturally get hoisted. Therefore, since diff checks are
cheaper, we should prefer to use those instead.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp  |   5 +-
 .../LoopVectorize/runtime-checks-hoist.ll | 143 ++
 2 files changed, 121 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp 
b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 3d1edd5f038a25..05765223397987 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -346,7 +346,10 @@ void RuntimePointerChecking::tryToCreateDiffCheck(
 auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
 const Loop *StartARLoop = SrcStartAR->getLoop();
 if (StartARLoop == SinkStartAR->getLoop() &&
-StartARLoop == InnerLoop->getParentLoop()) {
+StartARLoop == InnerLoop->getParentLoop() &&
+!SE->isKnownPredicate(ICmpInst::ICMP_EQ,
+  SrcStartAR->getStepRecurrence(*SE),
+  SinkStartAR->getStepRecurrence(*SE))) {
   LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
"cannot be hoisted out of the outer loop\n");
   CanUseDiffCheck = false;
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll 
b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
index 891597cbdc48a8..81702bf34e96be 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
@@ -69,11 +69,11 @@ define void @diff_checks(ptr nocapture noundef writeonly 
%dst, ptr nocapture nou
 ; CHECK-NEXT:[[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], [[TMP10]]
 ; CHECK-NEXT:[[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 
[[TMP14]]
 ; CHECK-NEXT:[[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], 
i32 0
-; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, 
!alias.scope !0
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, 
!alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:[[TMP17:%.*]] = add nsw i64 [[TMP13]], [[TMP11]]
 ; CHECK-NEXT:[[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 
[[TMP17]]
 ; CHECK-NEXT:[[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], 
i32 0
-; CHECK-NEXT:store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, 
!alias.scope !3, !noalias !0
+; CHECK-NEXT:store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, 
!alias.scope [[META3:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:[[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -189,12 +189,12 @@ define void @full_checks(ptr nocapture noundef %dst, ptr 
nocapture noundef reado
 ; CHECK-NEXT:[[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 
[[TMP5]]
 ; CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, 
!alias.scope !9
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, 
!alias.scope [[META9:![0-9]+]]
 ; CHECK-NEXT:[[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 
[[TMP5]]
 ; CHECK-NEXT:[[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT:[[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, 
!alias.scope !12, !noalias !9
+; CHECK-NEXT:[[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, 
!alias.scope [[META12:![0-9]+]], !noalias [[META9]]
 ; CHECK-NEXT:[[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], 
[[WIDE_LOAD]]
-; CHECK-NEXT:store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, 
!alias.scope !12, !noalias !9
+; CHECK-NEXT:store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, 
!alias.scope [[META12]], !noalias [[META9]]
 ; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:[[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -319,13 +319,13 @@ define void @full_ch

[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-15 Thread David Sherwood via cfe-commits

https://github.com/david-arm commented:

Wow, this is a huge patch. :) It took me a few hours to work through all the 
tests, and it's quite possible I've missed something. However, overall it looks 
fine and I can't see any major issues. I think there is one missing test, but 
once that's fixed I'll approve it!

https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-15 Thread David Sherwood via cfe-commits

https://github.com/david-arm edited 
https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-15 Thread David Sherwood via cfe-commits


@@ -0,0 +1,2503 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 
-target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 
-target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror 
-Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror 
-Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 
-target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svld2q_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[TMP0:%.*]] = tail call { ,  } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr 
[[BASE:%.*]])
+// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { ,  } [[TMP0]], 0
+// CHECK-NEXT:[[TMP2:%.*]] = tail call  
@llvm.vector.insert.nxv32i8.nxv16i8( poison,  [[TMP1]], i64 0)
+// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { ,  } [[TMP0]], 1
+// CHECK-NEXT:[[TMP4:%.*]] = tail call  
@llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]],  [[TMP3]], i64 16)
+// CHECK-NEXT:ret  [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z14test_svld2q_u8u10__SVBool_tPKh(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call { ,  } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], 
ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , 
 } [[TMP0]], 0
+// CPP-CHECK-NEXT:[[TMP2:%.*]] = tail call  
@llvm.vector.insert.nxv32i8.nxv16i8( poison,  [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , 
 } [[TMP0]], 1
+// CPP-CHECK-NEXT:[[TMP4:%.*]] = tail call  
@llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]],  [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:ret  [[TMP4]]
+//
+svuint8x2_t test_svld2q_u8(svbool_t pg, const uint8_t *base)
+{
+  return SVE_ACLE_FUNC(svld2q,,_u8,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[TMP0:%.*]] = tail call { ,  } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr 
[[BASE:%.*]])
+// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { ,  } [[TMP0]], 0
+// CHECK-NEXT:[[TMP2:%.*]] = tail call  
@llvm.vector.insert.nxv32i8.nxv16i8( poison,  [[TMP1]], i64 0)
+// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { ,  } [[TMP0]], 1
+// CHECK-NEXT:[[TMP4:%.*]] = tail call  
@llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]],  [[TMP3]], i64 16)
+// CHECK-NEXT:ret  [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z14test_svld2q_s8u10__SVBool_tPKa(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call { ,  } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], 
ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , 
 } [[TMP0]], 0
+// CPP-CHECK-NEXT:[[TMP2:%.*]] = tail call  
@llvm.vector.insert.nxv32i8.nxv16i8( poison,  [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , 
 } [[TMP0]], 1
+// CPP-CHECK-NEXT:[[TMP4:%.*]] = tail call  
@llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]],  [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:ret  [[TMP4]]
+//
+svint8x2_t test_svld2q_s8(svbool_t pg, const int8_t *base)
+{
+  return SVE_ACLE_FUNC(svld2q,,_s8,)(pg, base);
+}
+// CHECK-LABEL: @test_svld2q_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[TMP0:%.*]] = tail call  
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]])
+// CHECK-NEXT:[[TMP1:%.*]] = tail call { ,  } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr 
[[BASE:%.*]])
+// CHECK-NEXT:[[TMP2:%.*]] = extractvalue { ,  } [[TMP1]], 0
+// CHECK-NEXT:[[TMP3:%.*]] = tail call  
@llvm.vector.insert.nxv16i16.nxv8i16( poison,  [[TMP2]], i64 0)
+// CHECK-NEXT:[[TMP4:%.*]] = extractvalue { ,  } [[TMP1]], 1
+// CHECK-NEXT:[[TMP5:%.*]] = tail call  
@llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]],  [[TMP4]], i64 8)
+// CHECK-NEXT:ret  [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_u16u10__SVBool_tPKt(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call  
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]])

[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-20 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM. I think I would have preferred the patch to be split up into 3 - one for 
contiguous extending loads/truncating stores, one for structured loads/stores, 
and one for the gathers. That's why it took me so long to review this patch as 
I was constantly trying to keep all the information about each 
builtin/instruction in my head whilst reviewing the tests for correctness!

https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64][Clang] Refactor code to emit SVE & SME builtins (PR #70959)

2023-11-02 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM!

https://github.com/llvm/llvm-project/pull/70959
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-03 Thread David Sherwood via cfe-commits

https://github.com/david-arm commented:

Thanks for this! I've not done an exhaustive review, but I'll leave the 
comments I have so far.

https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-03 Thread David Sherwood via cfe-commits

https://github.com/david-arm edited 
https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-03 Thread David Sherwood via cfe-commits


@@ -9702,17 +9727,34 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const 
CallExpr *E,
   auto VectorTy = cast(Ops.back()->getType());
   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
 
-  Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+  auto PredTy = MemoryTy;
+  auto AddrMemoryTy = MemoryTy;
+  bool IsTruncatingStore = true;

david-arm wrote:

Same comment as in EmitSVEMaskedLoad. Perhaps better just to have an IsQuadStore 
boolean, since it's an exceptional case and unlikely to have commonality with 
other instructions?

https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-03 Thread David Sherwood via cfe-commits


@@ -2614,6 +2619,37 @@ def int_aarch64_sve_ld1_pn_x4 : 
SVE2p1_Load_PN_X4_Intrinsic;
 def int_aarch64_sve_ldnt1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic;
 def int_aarch64_sve_ldnt1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
 
+//
+// SVE2.1 - Contiguous loads to quadword (single vector)
+//
+
+class SVE2p1_Single_Load_Quadword
+: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+[llvm_nxv1i1_ty, llvm_ptr_ty],
+[IntrReadMem]>;

david-arm wrote:

I think this should have IntrArgMemOnly too, similar to 
AdvSIMD_1Vec_Load_Intrinsic.

https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-03 Thread David Sherwood via cfe-commits


@@ -9671,28 +9677,47 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const 
CallExpr *E,
   // The vector type that is returned may be different from the
   // eventual type loaded from memory.
   auto VectorTy = cast(ReturnTy);
-  auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+  llvm::ScalableVectorType *MemoryTy = nullptr;
+  llvm::ScalableVectorType *PredTy = nullptr;
+  bool IsExtendingLoad = true;

david-arm wrote:

I personally think using this variable is misleading because aarch64_sve_ld1uwq 
is actually an extending load - we're extending from 32-bit memory elements to 
128-bit integer elements. So it looks odd when we set this to false. Perhaps 
it's better to just explicitly have a variable called `IsQuadLoad` and use that 
instead, rather than trying to generalise this. The quad-word loads are really 
just an exception here because we're working around the lack of a 
`<vscale x 1 x i128>` type. So you'd have something like

  case Intrinsic::aarch64_sve_ld1uwq:
IsQuadLoad = true;
...
  default:
IsQuadLoad = false;


  Function *F =
  CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);\

...

  if (IsQuadLoad)
return Load;

https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-03 Thread David Sherwood via cfe-commits


@@ -9702,17 +9727,34 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const 
CallExpr *E,
   auto VectorTy = cast(Ops.back()->getType());
   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
 
-  Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+  auto PredTy = MemoryTy;
+  auto AddrMemoryTy = MemoryTy;
+  bool IsTruncatingStore = true;
+  ;

david-arm wrote:

Extra ; here

https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-03 Thread David Sherwood via cfe-commits


@@ -9671,28 +9677,47 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const 
CallExpr *E,
   // The vector type that is returned may be different from the
   // eventual type loaded from memory.
   auto VectorTy = cast(ReturnTy);
-  auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+  llvm::ScalableVectorType *MemoryTy = nullptr;
+  llvm::ScalableVectorType *PredTy = nullptr;
+  bool IsExtendingLoad = true;
+  switch (IntrinsicID) {
+  case Intrinsic::aarch64_sve_ld1uwq:
+  case Intrinsic::aarch64_sve_ld1udq:
+MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+PredTy =
+llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 
1);

david-arm wrote:

You can just do 
llvm::ScalableVectorType::get(Type::getInt1Ty(getLLVMContext()), 1);

https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (PR #70474)

2023-11-03 Thread David Sherwood via cfe-commits


@@ -2614,6 +2619,37 @@ def int_aarch64_sve_ld1_pn_x4 : 
SVE2p1_Load_PN_X4_Intrinsic;
 def int_aarch64_sve_ldnt1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic;
 def int_aarch64_sve_ldnt1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
 
+//
+// SVE2.1 - Contiguous loads to quadword (single vector)
+//
+
+class SVE2p1_Single_Load_Quadword
+: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+[llvm_nxv1i1_ty, llvm_ptr_ty],
+[IntrReadMem]>;
+def int_aarch64_sve_ld1uwq : SVE2p1_Single_Load_Quadword;
+def int_aarch64_sve_ld1udq : SVE2p1_Single_Load_Quadword;
+
+//
+// SVE2.1 - Contiguous store from quadword (single vector)
+//
+
+class SVE2p1_Single_Store_Quadword
+: DefaultAttrsIntrinsic<[],
+[llvm_anyvector_ty, llvm_nxv1i1_ty, llvm_ptr_ty],
+[IntrArgMemOnly]>;

david-arm wrote:

This also needs the IntrWriteMem flag; otherwise we could end up incorrectly 
rescheduling stores in the wrong place.

https://github.com/llvm/llvm-project/pull/70474
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Cast predicate operand of SVE gather loads/scater stores to the parameter type of the intrinsic (NFC) (PR #71289)

2023-11-06 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM!

https://github.com/llvm/llvm-project/pull/71289
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)

2023-12-12 Thread David Sherwood via cfe-commits

https://github.com/david-arm closed 
https://github.com/llvm/llvm-project/pull/73515
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [LoopVectorize] Enable hoisting of runtime checks by default (PR #71538)

2023-12-12 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/71538

>From 8a2af20a52fd851eaff1cfa7d50df8b994d0db0d Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Tue, 7 Nov 2023 13:57:17 +
Subject: [PATCH 1/2] [LoopVectorize] Enable hoisting of runtime checks by
 default

With commit https://reviews.llvm.org/D152366 I introduced
functionality that permitted the hoisting of runtime memory checks
from a vectorised inner loop to the preheader of the next outer-most
loop. This is useful for benchmarks like SPEC2017's x264 where the
inner loop is vectorised and only has a small trip count. In such
cases the runtime memory checks become expensive and since the checks
never fail in the case of x264 it makes sense to do this. However,
this behaviour was controlled by the flag -hoist-runtime-checks
which was off by default.

This patch enables this flag by default for all targets, since I
believe this is a generally beneficial thing to do. I have tested
this with SPEC2017 and I see 2.3% and 2.6% improvements with x264 on
neoverse-v1 and neoverse-n1, respectively. Similarly, I saw slight
improvements in the overall geomean on both machines. The only
other notable changes were a 1% drop in the roms benchmark, which
was compensated for by a 1% improvement in fotonik3d.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp  |  2 +-
 .../invariant-store-vectorization.ll  | 86 +-
 .../multiple-strides-vectorization.ll | 90 ---
 .../runtime-checks-difference.ll  |  2 +-
 .../LoopVectorize/runtime-checks-hoist.ll |  2 +-
 5 files changed, 124 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp 
b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 3d1edd5f038a25..05ca09968207fe 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -142,7 +142,7 @@ static cl::opt<bool, true> HoistRuntimeChecks(
 "hoist-runtime-checks", cl::Hidden,
 cl::desc(
 "Hoist inner loop runtime memory checks to outer loop if possible"),
-cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(false));
+cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true));
 bool VectorizerParams::HoistRuntimeChecks;
 
 bool VectorizerParams::isInterleaveForced() {
diff --git 
a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll 
b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 9e36649bcf73d6..52101fda6309f6 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -13,9 +13,6 @@ target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; address.
 
 
-; memory check is found.conflict = b[max(n-1,1)] > a && (ptr a)+1 > (ptr b)
-
-
 define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr 
%b) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction(
 ; CHECK-NEXT:  entry:
@@ -346,74 +343,75 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, 
ptr nocapture readonly
 ; CHECK-NEXT:[[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
 ; CHECK-NEXT:br i1 [[CMP20]], label [[FOR_END10:%.*]], label 
[[FOR_COND1_PREHEADER_PREHEADER:%.*]]
 ; CHECK:   for.cond1.preheader.preheader:
-; CHECK-NEXT:[[SCEVGEP3:%.*]] = getelementptr i8, ptr [[VAR2:%.*]], i64 4
-; CHECK-NEXT:[[INVARIANT_GEP5:%.*]] = getelementptr i8, ptr [[VAR1:%.*]], 
i64 4
+; CHECK-NEXT:[[TMP0:%.*]] = add i32 [[ITR]], -1
+; CHECK-NEXT:[[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:[[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:[[TMP3:%.*]] = getelementptr i8, ptr [[VAR1:%.*]], i64 
[[TMP2]]
+; CHECK-NEXT:[[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 4
+; CHECK-NEXT:[[SCEVGEP2:%.*]] = getelementptr i8, ptr [[VAR2:%.*]], i64 4
 ; CHECK-NEXT:br label [[FOR_COND1_PREHEADER:%.*]]
 ; CHECK:   for.cond1.preheader:
 ; CHECK-NEXT:[[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], 
[[FOR_INC8:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
 ; CHECK-NEXT:[[J_022:%.*]] = phi i32 [ [[J_1_LCSSA:%.*]], [[FOR_INC8]] ], 
[ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:[[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV23]], 2
-; CHECK-NEXT:[[SCEVGEP:%.*]] = getelementptr i8, ptr [[VAR1]], i64 [[TMP0]]
-; CHECK-NEXT:[[GEP6:%.*]] = getelementptr i8, ptr [[INVARIANT_GEP5]], i64 
[[TMP0]]
 ; CHECK-NEXT:[[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]]
 ; CHECK-NEXT:br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label 
[[FOR_INC8]]
 ; CHECK:   for.body3.lr.ph:
 ; CHECK-NEXT:[[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[VAR1]], 
i64 [[INDVARS_IV23]]
-; CHECK-NEXT:[[TMP1:%.*]] = zext i32 [[J_022]] to i64
+; CHECK-NEXT:[[TMP4:%.*]] = zext i32 [[J_022]] to i64
 ; CHECK-NEXT:[[ARRAYIDX5_PROMOTED:%.*]] = load i32, pt

[clang-tools-extra] [llvm] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-12 Thread David Sherwood via cfe-commits


@@ -0,0 +1,726 @@
+
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-lit"

david-arm wrote:

Sorry, I just realised I have lost this change somehow. I'll fix it.

https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-12 Thread David Sherwood via cfe-commits


@@ -0,0 +1,726 @@
+
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-lit"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Value *Start,
+Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+Value *MaxLen, Value *Index, Value *Start,
+bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Recognize AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char AArch64LoopIdiomTransformLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
+"Transform specific loop idioms into optimised vector forms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(
+AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
+"Transform specific loop idioms into optimised vector forms", false, false)
+
+Pass *llvm::createAArch64LoopIdiomTransformPass() {
+  return new AArch64LoopIdiomTransformLegacyPass();
+}
+
+PreservedAnalyses
+AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM,
+   LoopStandardAnalysisResults &AR,
+   LPMUpdater &) {
+  if (DisableAll)
+return PreservedAnalyses::all();
+
+  const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+
+  AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
+  if (!LIT.run(&L))
+return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
+
+//===-

[llvm] [clang-tools-extra] [LoopVectorize] Enable hoisting of runtime checks by default (PR #71538)

2023-12-12 Thread David Sherwood via cfe-commits

david-arm wrote:

Gentle ping! https://github.com/llvm/llvm-project/pull/73515 has now landed so 
I think this patch should be ready to go.

https://github.com/llvm/llvm-project/pull/71538
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector zip & unzip builtins (PR #74841)

2023-12-12 Thread David Sherwood via cfe-commits




david-arm wrote:

For builtins that operate purely on SVE vectors I think we've used the 
convention of adding _vector_ to the test name, i.e. see 
acle_sme2_vector_rshl.c, etc. Should we do the same here?

https://github.com/llvm/llvm-project/pull/74841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-13 Thread David Sherwood via cfe-commits


@@ -0,0 +1,839 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Value *Start,
+Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+Value *MaxLen, Value *Index, Value *Start,
+bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymou

[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-13 Thread David Sherwood via cfe-commits


@@ -0,0 +1,839 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Value *Start,
+Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+Value *MaxLen, Value *Index, Value *Start,
+bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymou

[llvm] [clang] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-13 Thread David Sherwood via cfe-commits


@@ -0,0 +1,726 @@
+
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-lit"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Value *Start,
+Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+Value *MaxLen, Value *Index, Value *Start,
+bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Recognize AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char AArch64LoopIdiomTransformLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
+"Transform specific loop idioms into optimised vector forms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(
+AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
+"Transform specific loop idioms into optimised vector forms", false, false)
+
+Pass *llvm::createAArch64LoopIdiomTransformPass() {
+  return new AArch64LoopIdiomTransformLegacyPass();
+}
+
+PreservedAnalyses
+AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM,
+   LoopStandardAnalysisResults &AR,
+   LPMUpdater &) {
+  if (DisableAll)
+return PreservedAnalyses::all();
+
+  const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+
+  AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
+  if (!LIT.run(&L))
+return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
+
+//===-

[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-14 Thread David Sherwood via cfe-commits


@@ -0,0 +1,726 @@
+
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-lit"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Value *Start,
+Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+Value *MaxLen, Value *Index, Value *Start,
+bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Recognize AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char AArch64LoopIdiomTransformLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
+"Transform specific loop idioms into optimised vector forms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(
+AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
+"Transform specific loop idioms into optimised vector forms", false, false)
+
+Pass *llvm::createAArch64LoopIdiomTransformPass() {
+  return new AArch64LoopIdiomTransformLegacyPass();
+}
+
+PreservedAnalyses
+AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM,
+   LoopStandardAnalysisResults &AR,
+   LPMUpdater &) {
+  if (DisableAll)
+return PreservedAnalyses::all();
+
+  const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+
+  AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
+  if (!LIT.run(&L))
+return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
+
+//===-

[llvm] [clang] [Clang][SME2] Add builtins for moving multi-vectors to/from ZA (PR #71191)

2023-12-14 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM! I had one minor comment, but I won't hold up the patch for it.

https://github.com/llvm/llvm-project/pull/71191
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Clang][SME2] Add builtins for moving multi-vectors to/from ZA (PR #71191)

2023-12-14 Thread David Sherwood via cfe-commits

https://github.com/david-arm edited 
https://github.com/llvm/llvm-project/pull/71191
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Clang][SME2] Add builtins for moving multi-vectors to/from ZA (PR #71191)

2023-12-14 Thread David Sherwood via cfe-commits


@@ -299,6 +299,44 @@ multiclass ZAAddSub {
 defm SVADD : ZAAddSub<"add">;
 defm SVSUB : ZAAddSub<"sub">;
 
+// SME2 - MOVA
+
+//
+// Single, 2 and 4 vector-group read/write intrinsics.
+//
+
+multiclass ZAWrite_VG<string n, string t, string i, list<ImmCheck> checks> {
+  def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "_vg2",   "vim2", t, 
MergeNone, i # "_hor_vg2", [IsSharedZA, IsStreaming], checks>;
+  def NAME # _VG2_V : Inst<"svwrite_ver_" # n # "_vg2",   "vim2", t, 
MergeNone, i # "_ver_vg2", [IsSharedZA, IsStreaming], checks>;
+  def NAME # _VG4_H : Inst<"svwrite_hor_" # n # "_vg4",   "vim4", t, 
MergeNone, i # "_hor_vg4", [IsSharedZA, IsStreaming], checks>;
+  def NAME # _VG4_V : Inst<"svwrite_ver_" # n # "_vg4",   "vim4", t, 
MergeNone, i # "_ver_vg4", [IsSharedZA, IsStreaming], checks>;
+  def NAME # _VG1x2 : Inst<"svwrite_" # n # "_vg1x2", "vm2",  t, 
MergeNone, i # "_vg1x2",   [IsSharedZA, IsStreaming], []>;
+  def NAME # _VG1x4 : Inst<"svwrite_" # n # "_vg1x4", "vm4",  t, 
MergeNone, i # "_vg1x4",   [IsSharedZA, IsStreaming], []>;
+}
+
+let TargetGuard = "sme2" in {
+  defm SVWRITE_ZA8  : ZAWrite_VG<"za8[_{d}]",  "cUc",   "aarch64_sme_write", 
[ImmCheck<0, ImmCheck0_0>]>;

david-arm wrote:

This is just a thought - is it worth pushing the `"[_{d}]"` bit into the 
multiclass given it's the same for each size, i.e.

```
  def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "[_{d}]_vg2",   "vim2", t, 
MergeNone, i # "_hor_vg2", [IsSharedZA, IsStreaming], checks>;
```

and same question for the reads.

https://github.com/llvm/llvm-project/pull/71191
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [LoopVectorize] Enable hoisting of runtime checks by default (PR #71538)

2023-12-18 Thread David Sherwood via cfe-commits

https://github.com/david-arm closed 
https://github.com/llvm/llvm-project/pull/71538
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-11-24 Thread David Sherwood via cfe-commits

https://github.com/david-arm created 
https://github.com/llvm/llvm-project/pull/73326

When emitting TBAA information for enums in C code we currently just treat the 
data as an 'omnipotent char'. However, with C strict aliasing this means we 
fail to optimise certain cases. For example, in the SPEC2017 xz benchmark there 
are structs that contain arrays of enums, and clang pessimistically assumes that 
accesses to those enums could alias with other struct members that have a 
different type.

According to

https://en.cppreference.com/w/c/language/enum

enums should be treated as 'int' types unless
explicitly specified (C23) or if 'int' would not be large enough to hold all 
the enumerated values. In the latter case the compiler is free to choose a 
suitable integer that would hold all such values.

When compiling C code this patch generates TBAA
information for the enum by using an equivalent integer of the size clang has 
already chosen for the enum. I have ignored C++ for now because the rules are 
more complex.

New test added here:

  clang/test/CodeGen/tbaa.c

>From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Fri, 24 Nov 2023 13:20:23 +
Subject: [PATCH] [Clang] Emit TBAA info for enums in C

When emitting TBAA information for enums in C code we
currently just treat the data as an 'omnipotent char'.
However, with C strict aliasing this means we fail to
optimise certain cases. For example, in the SPEC2017
xz benchmark there are structs that contain arrays of
enums, and clang pessimistically assumes that accesses
to those enums could alias with other struct members
that have a different type.

According to

https://en.cppreference.com/w/c/language/enum

enums should be treated as 'int' types unless
explicitly specified (C23) or if 'int' would not be
large enough to hold all the enumerated values. In the
latter case the compiler is free to choose a suitable
integer that would hold all such values.

When compiling C code this patch generates TBAA
information for the enum by using an equivalent integer
of the size clang has already chosen for the enum. I
have ignored C++ for now because the rules are more
complex.

New test added here:

  clang/test/CodeGen/tbaa.c
---
 clang/lib/CodeGen/CodeGenTBAA.cpp |   5 +-
 clang/test/CodeGen/tbaa.c | 116 ++
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/tbaa.c

diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp 
b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 8705d3d65f1a573..f59d3d422d5209d 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type 
*Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
+if (!Features.CPlusPlus)
+  return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));
+
 // In C++ mode, types have linkage, so we can rely on the ODR and
 // on their mangled names, if they're external.
 // TODO: Is there a way to get a program-wide unique name for a
 // decl with local linkage or no linkage?
-if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible())
+if (!ETy->getDecl()->isExternallyVisible())
   return getChar();
 
 SmallString<256> OutName;
diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c
new file mode 100644
index 000..0ab81f60a71941c
--- /dev/null
+++ b/clang/test/CodeGen/tbaa.c
@@ -0,0 +1,116 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefixes=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// Test TBAA metadata generated by front-end.
+//
+// NO-TBAA-NOT: !tbaa
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef enum {
+  RED_AUTO_32,
+  GREEN_AUTO_32,
+  BLUE_AUTO_32
+} EnumAuto32;
+
+typedef enum {
+  RED_AUTO_64,
+  GREEN_AUTO_64,
+  BLUE_AUTO_64 = 0x1ull
+} EnumAuto64;
+
+typedef enum : uint16_t {
+  RED_16,
+  GREEN_16,
+  BLUE_16
+} Enum16;
+
+typedef enum : uint8_t {
+  RED_8,
+  GREEN_8,
+  BLUE_8
+} Enum8;
+
+uint32_t g0(EnumAuto32 *E, uint32_t *val) {
+// CHECK-LABEL: define{{.*}} i32 @g0(
+// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]

[clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-11-24 Thread David Sherwood via cfe-commits

https://github.com/david-arm edited 
https://github.com/llvm/llvm-project/pull/73326
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-11-24 Thread David Sherwood via cfe-commits


@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type 
*Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
+if (!Features.CPlusPlus)
+  return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));

david-arm wrote:

I am not sure if this is entirely correct so would appreciate some guidance 
here!

https://github.com/llvm/llvm-project/pull/73326
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-11-27 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/73326

>From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Fri, 24 Nov 2023 13:20:23 +
Subject: [PATCH 1/2] [Clang] Emit TBAA info for enums in C

When emitting TBAA information for enums in C code we
currently just treat the data as an 'omnipotent char'.
However, with C strict aliasing this means we fail to
optimise certain cases. For example, in the SPEC2017
xz benchmark there are structs that contain arrays of
enums, and clang pessimistically assumes that accesses
to those enums could alias with other struct members
that have a different type.

According to

https://en.cppreference.com/w/c/language/enum

enums should be treated as 'int' types unless
explicitly specified (C23) or if 'int' would not be
large enough to hold all the enumerated values. In the
latter case the compiler is free to choose a suitable
integer that would hold all such values.

When compiling C code this patch generates TBAA
information for the enum by using an equivalent integer
of the size clang has already chosen for the enum. I
have ignored C++ for now because the rules are more
complex.

New test added here:

  clang/test/CodeGen/tbaa.c
---
 clang/lib/CodeGen/CodeGenTBAA.cpp |   5 +-
 clang/test/CodeGen/tbaa.c | 116 ++
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/tbaa.c

diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp 
b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 8705d3d65f1a573..f59d3d422d5209d 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type 
*Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
+if (!Features.CPlusPlus)
+  return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));
+
 // In C++ mode, types have linkage, so we can rely on the ODR and
 // on their mangled names, if they're external.
 // TODO: Is there a way to get a program-wide unique name for a
 // decl with local linkage or no linkage?
-if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible())
+if (!ETy->getDecl()->isExternallyVisible())
   return getChar();
 
 SmallString<256> OutName;
diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c
new file mode 100644
index 000..0ab81f60a71941c
--- /dev/null
+++ b/clang/test/CodeGen/tbaa.c
@@ -0,0 +1,116 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefixes=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s 
-emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// Test TBAA metadata generated by front-end.
+//
+// NO-TBAA-NOT: !tbaa
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef enum {
+  RED_AUTO_32,
+  GREEN_AUTO_32,
+  BLUE_AUTO_32
+} EnumAuto32;
+
+typedef enum {
+  RED_AUTO_64,
+  GREEN_AUTO_64,
+  BLUE_AUTO_64 = 0x1ull
+} EnumAuto64;
+
+typedef enum : uint16_t {
+  RED_16,
+  GREEN_16,
+  BLUE_16
+} Enum16;
+
+typedef enum : uint8_t {
+  RED_8,
+  GREEN_8,
+  BLUE_8
+} Enum8;
+
+uint32_t g0(EnumAuto32 *E, uint32_t *val) {
+// CHECK-LABEL: define{{.*}} i32 @g0(
+// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH-LABEL: define{{.*}} i32 @g0(
+// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+  *val = 5;
+  *E = RED_AUTO_32;
+  return *val;
+}
+
+uint64_t g1(EnumAuto64 *E, uint64_t *val) {
+// CHECK-LABEL: define{{.*}} i64 @g1(
+// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+// PATH-LABEL: define{{.*}} i64 @g1(
+// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+  *val = 5;
+  *E = RED_AUTO_64;
+  return *val;
+}
+
+uint16_t g2(Enum16 *E, uint16_t *val) {
+// CHECK-LABEL: define{{.*}} i16 @g2(
+// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TA

[clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-11-27 Thread David Sherwood via cfe-commits


@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type 
*Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
+if (!Features.CPlusPlus)
+  return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));

david-arm wrote:

Good suggestion - thanks!

https://github.com/llvm/llvm-project/pull/73326
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64][SME2] Add multi-vector SEL (x2, x4) ACLE builtins & intrinsics (PR #73188)

2023-11-28 Thread David Sherwood via cfe-commits


@@ -0,0 +1,384 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | 
FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | 
FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o 
/dev/null %s
+#include 
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
+
+// 8-bit ZIPs

david-arm wrote:

I think this comment should say "8-bit SELs" and similarly for all the other 
comments in both the selx2 and selx4 files.

https://github.com/llvm/llvm-project/pull/73188
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64][SME2] Add multi-vector SEL (x2, x4) ACLE builtins & intrinsics (PR #73188)

2023-11-28 Thread David Sherwood via cfe-commits




david-arm wrote:

Should the file be renamed to acle_sme2_vector_selx4? This would make it 
consistent with the existing acle_sme2_vector_add.c file, which also has 
SVE-like instructions that only operate on SVE vectors.

https://github.com/llvm/llvm-project/pull/73188
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64][SME2] Add multi-vector SEL (x2, x4) ACLE builtins & intrinsics (PR #73188)

2023-11-28 Thread David Sherwood via cfe-commits




david-arm wrote:

Should the file be renamed to acle_sme2_vector_selx2? This would make it 
consistent with the existing acle_sme2_vector_add.c file, which also has 
SVE-like instructions that only operate on SVE vectors.

https://github.com/llvm/llvm-project/pull/73188
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64][SME2] Add multi-vector SEL (x2, x4) ACLE builtins & intrinsics (PR #73188)

2023-11-29 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM! Thanks for the changes. :)

https://github.com/llvm/llvm-project/pull/73188
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [LoopVectorize] Improve algorithm for hoisting runtime checks (PR #73515)

2023-11-30 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/73515

>From 30251642f8c208c63f3f3097c337ef0d5bc633b5 Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Mon, 27 Nov 2023 13:43:26 +
Subject: [PATCH 1/3] [LoopVectorize] Improve algorithm for hoisting runtime
 checks

When attempting to hoist runtime checks out of a loop we currently
avoid creating pointer diff checks and prefer to do expanded range
checks instead. This gives us the opportunity to hoist runtime
checks out of a loop, since these checks are loop invariant. However,
in some cases the pointer diff checks would also be loop invariant
and so will naturally get hoisted. Therefore, since diff checks are
cheaper, we should prefer to use those instead.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp  |   5 +-
 .../LoopVectorize/runtime-checks-hoist.ll | 143 ++
 2 files changed, 121 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp 
b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 3d1edd5f038a25e..057652233979876 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -346,7 +346,10 @@ void RuntimePointerChecking::tryToCreateDiffCheck(
 auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
 const Loop *StartARLoop = SrcStartAR->getLoop();
 if (StartARLoop == SinkStartAR->getLoop() &&
-StartARLoop == InnerLoop->getParentLoop()) {
+StartARLoop == InnerLoop->getParentLoop() &&
+!SE->isKnownPredicate(ICmpInst::ICMP_EQ,
+  SrcStartAR->getStepRecurrence(*SE),
+  SinkStartAR->getStepRecurrence(*SE))) {
   LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
"cannot be hoisted out of the outer loop\n");
   CanUseDiffCheck = false;
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll 
b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
index 891597cbdc48a8f..81702bf34e96bed 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
@@ -69,11 +69,11 @@ define void @diff_checks(ptr nocapture noundef writeonly 
%dst, ptr nocapture nou
 ; CHECK-NEXT:[[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], [[TMP10]]
 ; CHECK-NEXT:[[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 
[[TMP14]]
 ; CHECK-NEXT:[[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], 
i32 0
-; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, 
!alias.scope !0
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, 
!alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:[[TMP17:%.*]] = add nsw i64 [[TMP13]], [[TMP11]]
 ; CHECK-NEXT:[[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 
[[TMP17]]
 ; CHECK-NEXT:[[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], 
i32 0
-; CHECK-NEXT:store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, 
!alias.scope !3, !noalias !0
+; CHECK-NEXT:store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, 
!alias.scope [[META3:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:[[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -189,12 +189,12 @@ define void @full_checks(ptr nocapture noundef %dst, ptr 
nocapture noundef reado
 ; CHECK-NEXT:[[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 
[[TMP5]]
 ; CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, 
!alias.scope !9
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, 
!alias.scope [[META9:![0-9]+]]
 ; CHECK-NEXT:[[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 
[[TMP5]]
 ; CHECK-NEXT:[[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT:[[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, 
!alias.scope !12, !noalias !9
+; CHECK-NEXT:[[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, 
!alias.scope [[META12:![0-9]+]], !noalias [[META9]]
 ; CHECK-NEXT:[[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], 
[[WIDE_LOAD]]
-; CHECK-NEXT:store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, 
!alias.scope !12, !noalias !9
+; CHECK-NEXT:store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, 
!alias.scope [[META12]], !noalias [[META9]]
 ; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:[[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -319,13 +319,13 @@ define void @ful

[clang] [llvm] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)

2023-11-30 Thread David Sherwood via cfe-commits


@@ -1859,6 +1867,34 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, 
unsigned NumVecs,
   SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode);
 }
 
+template 

david-arm wrote:

Rather than create two almost identical copies of the function with a template 
parameter, I think in this case it makes sense to just pass Max in as a 
function argument.

https://github.com/llvm/llvm-project/pull/73317
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)

2023-11-30 Thread David Sherwood via cfe-commits


@@ -1859,6 +1867,34 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, 
unsigned NumVecs,
   SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode);
 }
 
+template 
+void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
+unsigned NumOutVecs,
+unsigned Opc) {
+  if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4)))
+if (Imm->getZExtValue() > Max)
+  return;
+
+  SDValue ZtValue;
+  ImmToTile(Node->getOperand(2), ZtValue);

david-arm wrote:

If someone invokes the intrinsic with Op2 != 0 this will likely crash. Is it 
worth asserting the result of ImmToTile is true so that at least it's more 
obvious?

https://github.com/llvm/llvm-project/pull/73317
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)

2023-11-30 Thread David Sherwood via cfe-commits


@@ -0,0 +1,280 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include 
+
+// CHECK-LABEL: @test_svluti2_lane_zt_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[TMP0:%.*]] = tail call { , , ,  } 
@llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0,  
[[ZN:%.*]], i32 0)
+// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , ,  } [[TMP0]], 0
+// CHECK-NEXT:[[TMP2:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( poison,  [[TMP1]], i64 0)
+// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , ,  } [[TMP0]], 1
+// CHECK-NEXT:[[TMP4:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]],  [[TMP3]], i64 16)
+// CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , ,  } [[TMP0]], 2
+// CHECK-NEXT:[[TMP6:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]],  [[TMP5]], i64 32)
+// CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , ,  } [[TMP0]], 3
+// CHECK-NEXT:[[TMP8:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]],  [[TMP7]], i64 48)
+// CHECK-NEXT:ret  [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call { , , ,  } 
@llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0,  
[[ZN:%.*]], i32 0)
+// CPP-CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , 
, ,  } [[TMP0]], 0
+// CPP-CHECK-NEXT:[[TMP2:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( poison,  [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , 
, ,  } [[TMP0]], 1
+// CPP-CHECK-NEXT:[[TMP4:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]],  [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , 
, ,  } [[TMP0]], 2
+// CPP-CHECK-NEXT:[[TMP6:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]],  [[TMP5]], i64 32)
+// CPP-CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , 
, ,  } [[TMP0]], 3
+// CPP-CHECK-NEXT:[[TMP8:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]],  [[TMP7]], i64 48)
+// CPP-CHECK-NEXT:ret  [[TMP8]]
+//
+svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming 
__arm_shared_za __arm_preserves_za {

david-arm wrote:

For all of the functions in both test files shouldn't we also be testing the 
overloaded forms of the builtins?

I'd expected to see 5 RUN lines in total for each file

https://github.com/llvm/llvm-project/pull/73317
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)

2023-11-30 Thread David Sherwood via cfe-commits


@@ -0,0 +1,51 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include 
+
+#ifdef SVE_OVERLOADED_FORMS

david-arm wrote:

Can delete these lines.

https://github.com/llvm/llvm-project/pull/72849
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)

2023-11-30 Thread David Sherwood via cfe-commits


@@ -0,0 +1,51 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s

david-arm wrote:

I think in this case we can kill off the RUN lines for the overloaded forms 
because in the ACLE they are never overloaded.

https://github.com/llvm/llvm-project/pull/72849
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)

2023-11-30 Thread David Sherwood via cfe-commits


@@ -2748,6 +2748,22 @@ AArch64TargetLowering::EmitFill(MachineInstr &MI, 
MachineBasicBlock *BB) const {
   return BB;
 }
 
+MachineBasicBlock *AArch64TargetLowering::EmitZTSpillFill(MachineInstr &MI,
+  MachineBasicBlock 
*BB,
+  bool IsSpill) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineInstrBuilder MIB;
+  if (IsSpill) {

david-arm wrote:

I think this can be simplified to

```
  unsigned Opc = IsSpill ? AArch64::STR_TX : AArch64::LDR_TX;
  MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
  MIB.addReg(MI.getOperand(0).getReg());
  MIB.add(MI.getOperand(1)); // Base
```

https://github.com/llvm/llvm-project/pull/72849
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)

2023-11-30 Thread David Sherwood via cfe-commits

david-arm wrote:

It looks like a few other pull requests are changing the same code around 
ImmToTile. Might be good to land this smaller patch first so you can rebase the 
others and reduce the diffs!

https://github.com/llvm/llvm-project/pull/72849
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64][SME2] Add ldr_zt, str_zt builtins and intrinsics (PR #72849)

2023-11-30 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM! C'est parfait!

https://github.com/llvm/llvm-project/pull/72849
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)

2023-12-01 Thread David Sherwood via cfe-commits


@@ -0,0 +1,280 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include 
+
+// CHECK-LABEL: @test_svluti2_lane_zt_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[TMP0:%.*]] = tail call { , , ,  } 
@llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0,  
[[ZN:%.*]], i32 0)
+// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , ,  } [[TMP0]], 0
+// CHECK-NEXT:[[TMP2:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( poison,  [[TMP1]], i64 0)
+// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , ,  } [[TMP0]], 1
+// CHECK-NEXT:[[TMP4:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]],  [[TMP3]], i64 16)
+// CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , ,  } [[TMP0]], 2
+// CHECK-NEXT:[[TMP6:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]],  [[TMP5]], i64 32)
+// CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , ,  } [[TMP0]], 3
+// CHECK-NEXT:[[TMP8:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]],  [[TMP7]], i64 48)
+// CHECK-NEXT:ret  [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call { , , ,  } 
@llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0,  
[[ZN:%.*]], i32 0)
+// CPP-CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , 
, ,  } [[TMP0]], 0
+// CPP-CHECK-NEXT:[[TMP2:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( poison,  [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , 
, ,  } [[TMP0]], 1
+// CPP-CHECK-NEXT:[[TMP4:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]],  [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , 
, ,  } [[TMP0]], 2
+// CPP-CHECK-NEXT:[[TMP6:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]],  [[TMP5]], i64 32)
+// CPP-CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , 
, ,  } [[TMP0]], 3
+// CPP-CHECK-NEXT:[[TMP8:%.*]] = tail call  
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]],  [[TMP7]], i64 48)
+// CPP-CHECK-NEXT:ret  [[TMP8]]
+//
+svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming 
__arm_shared_za __arm_preserves_za {

david-arm wrote:

OK that's fair enough!

https://github.com/llvm/llvm-project/pull/73317
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)

2023-12-01 Thread David Sherwood via cfe-commits

https://github.com/david-arm commented:

This looks good to me, but I think it needs rebasing after 
https://github.com/llvm/llvm-project/pull/72849 landed. It also looks like 
@sdesmalen-arm left a comment about renaming ImmToTile - perhaps that could be 
done in this patch?

https://github.com/llvm/llvm-project/pull/73317
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64][SME2] Remove IsPreservesZA from ldr_zt builtin (PR #74303)

2023-12-04 Thread David Sherwood via cfe-commits


@@ -319,7 +319,7 @@ let TargetGuard = "sme2" in {
 // Spill and fill of ZT0
 //
 let TargetGuard = "sme2" in {
-  def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", 
[IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], 
[ImmCheck<0, ImmCheck0_0>]>;
+  def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", 
[IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, 
ImmCheck0_0>]>;
   def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", 
[IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], 
[ImmCheck<0, ImmCheck0_0>]>;

david-arm wrote:

Does the STR also need changing?

https://github.com/llvm/llvm-project/pull/74303
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Emit TBAA info for enums in C (PR #73326)

2023-12-04 Thread David Sherwood via cfe-commits

david-arm wrote:

Gentle ping!

https://github.com/llvm/llvm-project/pull/73326
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add builtins for multi-vector fp round to integral value (PR #75941)

2023-12-21 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM. Absolute perfection!

https://github.com/llvm/llvm-project/pull/75941
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [clang] [Clang][SME2] Enable multi-vector loads & stores for SME2 (PR #75821)

2023-12-21 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM! A lovely patch. :)

https://github.com/llvm/llvm-project/pull/75821
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-22 Thread David Sherwood via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired<LoopInfoWrapperPass>();
+AU.addRequired<DominatorTreeWrapperPass>();
+AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout

[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-22 Thread David Sherwood via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired<LoopInfoWrapperPass>();
+AU.addRequired<DominatorTreeWrapperPass>();
+AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout

[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-22 Thread David Sherwood via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired<LoopInfoWrapperPass>();
+AU.addRequired<DominatorTreeWrapperPass>();
+AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout

[clang-tools-extra] [llvm] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-22 Thread David Sherwood via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired<LoopInfoWrapperPass>();
+AU.addRequired<DominatorTreeWrapperPass>();
+AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout

[clang-tools-extra] [llvm] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-22 Thread David Sherwood via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired<LoopInfoWrapperPass>();
+AU.addRequired<DominatorTreeWrapperPass>();
+AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout

[llvm] [clang] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-22 Thread David Sherwood via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired<LoopInfoWrapperPass>();
+AU.addRequired<DominatorTreeWrapperPass>();
+AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout

[clang-tools-extra] [clang] [llvm] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)

2024-01-08 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/76034

>From a4caa47dc8d2db75f6bb2ac3f880da4e1f6bea82 Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Tue, 19 Dec 2023 16:07:33 +
Subject: [PATCH 1/2] Add tests showing runtime checks cost with low trip
 counts

---
 .../AArch64/low_trip_memcheck_cost.ll | 187 ++
 1 file changed, 187 insertions(+)
 create mode 100644 
llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 00..397521c2d3dc8f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,187 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < 
%s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef 
readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK:  Calculating cost of runtime checks:
+; CHECK:  Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef 
readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK:  Calculating cost of runtime checks:
+; CHECK:  Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef 
readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK:  Calculating cost of runtime checks:
+; CHECK:  Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+defi

[clang] [llvm] [clang-tools-extra] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)

2024-01-08 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/76034

>From a4caa47dc8d2db75f6bb2ac3f880da4e1f6bea82 Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Tue, 19 Dec 2023 16:07:33 +
Subject: [PATCH 1/6] Add tests showing runtime checks cost with low trip
 counts

---
 .../AArch64/low_trip_memcheck_cost.ll | 187 ++
 1 file changed, 187 insertions(+)
 create mode 100644 
llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 00..397521c2d3dc8f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,187 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < 
%s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef 
readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK:  Calculating cost of runtime checks:
+; CHECK:  Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef 
readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK:  Calculating cost of runtime checks:
+; CHECK:  Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef 
readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK:  Calculating cost of runtime checks:
+; CHECK:  Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+defi

[llvm] [clang] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2024-01-09 Thread David Sherwood via cfe-commits

https://github.com/david-arm closed 
https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2024-01-09 Thread David Sherwood via cfe-commits

david-arm wrote:

Hi @dyung, sorry about this! It passed for me locally. It sounds like the test 
needs a `REQUIRES: aarch64-registered-target` line somewhere then.

I'll try to fix it asap.


https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2024-01-09 Thread David Sherwood via cfe-commits

david-arm wrote:

@dyung - fix pending here https://github.com/llvm/llvm-project/pull/77467

https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2024-01-09 Thread David Sherwood via cfe-commits

david-arm wrote:

@dyung - fix pending here https://github.com/llvm/llvm-project/pull/77467

https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64][SME] Fix multi vector cvt builtins (PR #77656)

2024-01-11 Thread David Sherwood via cfe-commits


@@ -34,118 +34,118 @@ define  
@multi_vector_cvt_x2_bf16( %unu
 ;
 ; FCVTZS
 ;
-define {, }  
@multi_vector_cvt_x2_f32_s32( %unused,  
%zn0,  %zn1) {
-; CHECK-LABEL: multi_vector_cvt_x2_f32_s32:
+define {, }  
@multi_vector_cvt_x2_s32_f32( %unused,  
%zn0,  %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_s32_f32:
 ; CHECK:   // %bb.0:
 ; CHECK-NEXT:mov z3.d, z2.d
 ; CHECK-NEXT:mov z2.d, z1.d
 ; CHECK-NEXT:fcvtzs { z0.s, z1.s }, { z2.s, z3.s }
 ; CHECK-NEXT:ret
-  %res = call {, } 
@llvm.aarch64.sve.fcvts.x2.nxv4f32(%zn0, %zn1)
-  ret {, } %res
+  %res = call {, } 
@llvm.aarch64.sve.fcvts.x2.nxv4f32( %zn0,  %zn1)
+  ret {, } %res
 }
 
-define {, ,, 
}  @multi_vector_cvt_x4_f32_s32( %unused, 
 %zn0,  %zn1,  %zn2, 
 %zn3) {
-; CHECK-LABEL: multi_vector_cvt_x4_f32_s32:
+define {, ,, }  @multi_vector_cvt_x4_s32_f32( %unused,  %zn0,  %zn1,  %zn2,  %zn3) {
+; CHECK-LABEL: multi_vector_cvt_x4_s32_f32:
 ; CHECK:   // %bb.0:
 ; CHECK-NEXT:mov z7.d, z4.d
 ; CHECK-NEXT:mov z6.d, z3.d
 ; CHECK-NEXT:mov z5.d, z2.d
 ; CHECK-NEXT:mov z4.d, z1.d
 ; CHECK-NEXT:fcvtzs { z0.s - z3.s }, { z4.s - z7.s }
 ; CHECK-NEXT:ret
-  %res = call {, ,, } @llvm.aarch64.sve.fcvts.x4.nxv4f32(%zn0, %zn1, %zn2, %zn3)
-  ret {, , , 
} %res
+  %res = call {, ,, 
} @llvm.aarch64.sve.fcvts.x4.nxv4f32( 
%zn0,  %zn1,  %zn2,  %zn3)
+  ret {, , , } %res
 }
 
 ;
 ; FCVTZU
 ;
-define {, }  
@multi_vector_cvt_x2_f32_u32( %unused,  
%zn0,  %zn1) {
-; CHECK-LABEL: multi_vector_cvt_x2_f32_u32:
+define {, }  
@multi_vector_cvt_x2_u32_f32( %unused,  
%zn0,  %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_u32_f32:
 ; CHECK:   // %bb.0:
 ; CHECK-NEXT:mov z3.d, z2.d
 ; CHECK-NEXT:mov z2.d, z1.d
 ; CHECK-NEXT:fcvtzu { z0.s, z1.s }, { z2.s, z3.s }
 ; CHECK-NEXT:ret
-  %res = call {, } 
@llvm.aarch64.sve.fcvtu.x2.nxv4f32(%zn0, %zn1)
-  ret {, } %res
+  %res = call {, } 
@llvm.aarch64.sve.fcvtu.x2.nxv4f32( %zn0,  %zn1)
+  ret {, } %res
 }
 
-define {, , , 
}  @multi_vector_cvt_x4_f32_u32( %unused, 
 %zn0,  %zn1,  %zn2, 
 %zn3) {
-; CHECK-LABEL: multi_vector_cvt_x4_f32_u32:
+define {, , , }  @multi_vector_cvt_x4_u32_f32( %unused,  %zn0,  %zn1,  %zn2,  %zn3) {
+; CHECK-LABEL: multi_vector_cvt_x4_u32_f32:
 ; CHECK:   // %bb.0:
 ; CHECK-NEXT:mov z7.d, z4.d
 ; CHECK-NEXT:mov z6.d, z3.d
 ; CHECK-NEXT:mov z5.d, z2.d
 ; CHECK-NEXT:mov z4.d, z1.d
 ; CHECK-NEXT:fcvtzu { z0.s - z3.s }, { z4.s - z7.s }
 ; CHECK-NEXT:ret
-  %res = call {, ,, } @llvm.aarch64.sve.fcvtu.x4.nxv4f32(%zn0, %zn1, %zn2, %zn3)
-  ret {, , , 
} %res
+  %res = call {, ,, 
} @llvm.aarch64.sve.fcvtu.x4.nxv4f32( 
%zn0,  %zn1,  %zn2,  %zn3)
+  ret {, , , } %res
 }
 
 ;
 ; SCVTF
 ;
-define {, }  
@multi_vector_cvt_x2_s32_f32(%unused,  
%zn0,  %zn1) {
-; CHECK-LABEL: multi_vector_cvt_x2_s32_f32:
+define {, }  
@multi_vector_cvt_x2_f32_s32( %unused,  
%zn0,  %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_f32_s32:
 ; CHECK:   // %bb.0:
 ; CHECK-NEXT:mov z3.d, z2.d
 ; CHECK-NEXT:mov z2.d, z1.d
 ; CHECK-NEXT:scvtf { z0.s, z1.s }, { z2.s, z3.s }
 ; CHECK-NEXT:ret
-  %res = call {, } 
@llvm.aarch64.sve.scvtf.x2.nxv4f32(%zn0, %zn1)
-  ret {, } %res
+  %res = call {, } 
@llvm.aarch64.sve.scvtf.x2.nxv4i32( %zn0,  
%zn1)

david-arm wrote:

Shouldn't the intrinsic name be

`@llvm.aarch64.sve.scvtf.x2.nxv4f32`

because the intrinsics are all keyed off the floating point type, with bitcasts 
of the variable FP type to an integer type? I realise this does seem to work, 
but perhaps it's clearer to use the correct type.

https://github.com/llvm/llvm-project/pull/77656
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] ceb6c23 - [NFC][LoopVectorize] Explicitly disable tail-folding on some SVE tests

2022-07-21 Thread David Sherwood via cfe-commits

Author: David Sherwood
Date: 2022-07-21T15:23:00+01:00
New Revision: ceb6c23b708d4cae3fbb0a569c5ac14069524a63

URL: 
https://github.com/llvm/llvm-project/commit/ceb6c23b708d4cae3fbb0a569c5ac14069524a63
DIFF: 
https://github.com/llvm/llvm-project/commit/ceb6c23b708d4cae3fbb0a569c5ac14069524a63.diff

LOG: [NFC][LoopVectorize] Explicitly disable tail-folding on some SVE tests

This patch is in preparation for enabling vectorisation with tail-folding
by default for SVE targets. Once we do that many existing tests will
break that depend upon having normal unpredicated vector loops. For
all such tests I have added the flag:

  -prefer-predicate-over-epilogue=scalar-epilogue

Differential Revision: https://reviews.llvm.org/D129137

Added: 


Modified: 
clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c

llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll

llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll

llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll

Removed: 




diff  --git a/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c 
b/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c
index bccd328f0ccad..e306f44c27fb3 100644
--- a/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c
+++ b/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c
@@ -1,7 +1,11 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -O2 -S -o - %s -mvscale-min=2 -mvscale-max=2 
 | FileCheck %s --check-prefixes=CHECK,CHECK256
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -O2 -S -o - %s -mvscale-min=4 -mvscale-max=4 
 | FileCheck %s --check-prefixes=CHECK,CHECK512
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -O2 -S -o - %s -mvscale-min=8 -mvscale-max=8 
| FileCheck %s --check-prefixes=CHECK,CHECK1024
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -O2 -S -o - %s -mvscale-min=16 
-mvscale-max=16 | FileCheck %s --check-prefixes=CHECK,CHECK2048
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -O2 -S \
+// RUN:   -mllvm -prefer-predicate-over-epilogue=scalar-epilogue -o - %s 
-mvscale-min=2 -mvscale-max=2  | FileCheck %s --check-prefixes=CHECK,CHECK256
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -O2 -S \
+// RUN:   -mllvm -prefer-predicate-over-epilogue=scalar-epilogue -o - %s 
-mvscale-min=4 -mvscale-max=4  | FileCheck %s --check-prefixes=CHECK,CHECK512
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -O2 -S \
+// RUN:   -mllvm -prefer-predicate-over-epilogue=scalar-epilogue -o - %s 
-mvscale-min=8 -mvscale-max=8 | FileCheck %s --check-prefixes=CHECK,CHECK1024
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-fallow-half-arguments-and-returns -O2 -S \
+// RUN:   -m

[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2024-02-06 Thread David Sherwood via cfe-commits

david-arm wrote:

> Hi! I wonder whether you have conducted any tests to determine the potential 
> performance increase of this pass in the SPEC2017 557xz benchmark? I 
> attempted to apply it to the xz benchmark, but only one copy (--copies=1) 
> demonstrated a significant increase (about 3%), and there was no increase when 
> I set --copies=128 or higher. Do you have any suggestions or test results 
> that you could share?

The most significant gains with xz have already been achieved when 
https://github.com/llvm/llvm-project/pull/77480 and 
https://github.com/llvm/llvm-project/pull/77480 landed, which improved 
performance by 6-7% for neoverse-v1. This PR is an NFC refactoring patch so it 
won't improve performance further. My follow-on patch (not yet posted) will 
trigger more cases in xz, but I don't expect any substantial performance gains 
for xz. The main goal of extending this pass further is to improve code 
coverage and testing, and hopefully there will be other applications besides xz 
that will benefit too.

https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [LTO] Fix Veclib flags correctly pass to LTO flags (PR #78749)

2024-01-24 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM as well!

https://github.com/llvm/llvm-project/pull/78749
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [clang] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)

2024-01-15 Thread David Sherwood via cfe-commits

david-arm wrote:

Gentle ping!

https://github.com/llvm/llvm-project/pull/76034
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [clang] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)

2024-01-18 Thread David Sherwood via cfe-commits

https://github.com/david-arm updated 
https://github.com/llvm/llvm-project/pull/76034

>From a4caa47dc8d2db75f6bb2ac3f880da4e1f6bea82 Mon Sep 17 00:00:00 2001
From: David Sherwood 
Date: Tue, 19 Dec 2023 16:07:33 +0000
Subject: [PATCH 1/6] Add tests showing runtime checks cost with low trip
 counts

---
 .../AArch64/low_trip_memcheck_cost.ll | 187 ++
 1 file changed, 187 insertions(+)
 create mode 100644 
llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 00..397521c2d3dc8f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,187 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < 
%s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef 
readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK:  Calculating cost of runtime checks:
+; CHECK:  Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef 
readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK:  Calculating cost of runtime checks:
+; CHECK:  Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef 
readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK:  Calculating cost of runtime checks:
+; CHECK:  Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+defi

[llvm] [clang-tools-extra] [clang] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)

2024-01-18 Thread David Sherwood via cfe-commits


@@ -2076,16 +2081,61 @@ class GeneratedRTChecks {
 LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
 RTCheckCost += C;
   }
-if (MemCheckBlock)
+if (MemCheckBlock) {
+  InstructionCost MemCheckCost = 0;
   for (Instruction &I : *MemCheckBlock) {
 if (MemCheckBlock->getTerminator() == &I)
   continue;
 InstructionCost C =
 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
 LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-RTCheckCost += C;
+MemCheckCost += C;
   }
 
+  // If the runtime memory checks are being created inside an outer loop
+  // we should find out if these checks are outer loop invariant. If so,
+  // the checks will likely be hoisted out and so the effective cost will
+  // reduce according to the outer loop trip count.
+  if (OuterLoop) {
+ScalarEvolution *SE = MemCheckExp.getSE();
+// TODO: We could refine this further by analysing every individual
+// memory check, since there could be a mixture of loop variant and
+// invariant checks that mean the final condition is variant. However,
+// I think it would need further analysis to prove this is beneficial.
+const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+if (SE->isLoopInvariant(Cond, OuterLoop)) {
+  // It seems reasonable to assume that we can reduce the effective
+  // cost of the checks even when we know nothing about the trip
+  // count. Here I've assumed that the outer loop executes at least
+  // twice.
+  unsigned BestTripCount = 2;
+
+  // If exact trip count is known use that.
+  if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
+BestTripCount = SmallTC;
+  else if (LoopVectorizeWithBlockFrequency) {
+// Else use profile data if available.
+if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
+  BestTripCount = *EstimatedTC;
+  }
+
+  InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
+
+  // Let's ensure the cost is always at least 1.
+  NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), (long)1);

david-arm wrote:

Good spot! I hope I've fixed it now. :)

https://github.com/llvm/llvm-project/pull/76034
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [LTO] Fix Veclib flags correctly pass to LTO flags (PR #78749)

2024-01-22 Thread David Sherwood via cfe-commits


@@ -31,3 +31,31 @@
 
 // RUN: %clang -fveclib=Accelerate %s -nodefaultlibs -target 
arm64-apple-ios8.0.0 -### 2>&1 | FileCheck 
--check-prefix=CHECK-LINK-NODEFAULTLIBS %s
 // CHECK-LINK-NODEFAULTLIBS-NOT: "-framework" "Accelerate"
+
+
+/* Verify that the correct vector library is passed to LTO flags. */
+
+
+// RUN: %clang -### -fveclib=none -flto %s -v 2>&1  | FileCheck -check-prefix 
CHECK-LTO-NOLIB %s
+// CHECK-LTO-NOLIB: "-plugin-opt=-vector-library=none"
+
+// RUN: %clang -### -fveclib=Accelerate -flto %s -v 2>&1  | FileCheck 
-check-prefix CHECK-LTO-ACCELERATE %s
+// CHECK-LTO-ACCELERATE: "-plugin-opt=-vector-library=Accelerate"
+
+// RUN: %clang -### -fveclib=LIBMVEC -flto %s -v 2>&1  | FileCheck 
-check-prefix CHECK-LTO-LIBMVEC %s
+// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC-X86"
+
+// RUN: %clang -### -fveclib=MASSV -flto %s -v 2>&1  | FileCheck -check-prefix 
CHECK-LTO-MASSV %s
+// CHECK-LTO-MASSV: "-plugin-opt=-vector-library=MASSV"
+
+// RUN: not %clang -### -fveclib=SVML -flto %s -v 2>&1  | FileCheck 
-check-prefix CHECK-LTO-SVML %s
+// CHECK-LTO-SVML: "-plugin-opt=-vector-library=SVML"
+
+// RUN: %clang -### -fveclib=SLEEF -flto %s -v 2>&1  | FileCheck -check-prefix 
CHECK-LTO-SLEEF %s
+// CHECK-LTO-SLEEF: "-plugin-opt=-vector-library=sleefgnuabi"
+
+// RUN: %clang -### -fveclib=Darwin_libsystem_m -flto %s -v 2>&1  | FileCheck 
-check-prefix CHECK-LTO-DARWIN %s
+// CHECK-LTO-DARWIN: "-plugin-opt=-vector-library=Darwin_libsystem_m"
+
+// RUN: %clang -### -fveclib=ArmPL -flto %s -v 2>&1  | FileCheck -check-prefix 
CHECK-LTO-ARMPL %s

david-arm wrote:

Looks like `--target=aarch64-none-none` is needed for SLEEF and ArmPL perhaps? 
In the first 8 RUN lines it looks like we don't specify the target except for 
those cases.

https://github.com/llvm/llvm-project/pull/78749
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [LTO] Fix Veclib flags correctly pass to LTO flags (PR #78749)

2024-01-22 Thread David Sherwood via cfe-commits


@@ -783,6 +783,28 @@ void tools::addLTOOptions(const ToolChain &ToolChain, 
const ArgList &Args,
  "-generate-arange-section"));
   }
 
+  // Pass vector library arguments to LTO.
+  Arg *ArgVecLib = Args.getLastArg(options::OPT_fveclib);
+  if (ArgVecLib && ArgVecLib->getNumValues() == 1) {
+// Map the vector library names from clang front-end to opt front-end. The
+// values are taken from the TargetLibraryInfo class command line options.
+std::optional OptVal =
+llvm::StringSwitch>(ArgVecLib->getValue())

david-arm wrote:

Is it possible to refactor and reuse existing TargetLibraryInfo code, i.e. 
create a common static function that maps the values so that it can be called 
in multiple places?

https://github.com/llvm/llvm-project/pull/78749
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [LTO] Fix Veclib flags correctly pass to LTO flags (PR #78749)

2024-01-22 Thread David Sherwood via cfe-commits


@@ -783,6 +783,28 @@ void tools::addLTOOptions(const ToolChain &ToolChain, 
const ArgList &Args,
  "-generate-arange-section"));
   }
 
+  // Pass vector library arguments to LTO.
+  Arg *ArgVecLib = Args.getLastArg(options::OPT_fveclib);
+  if (ArgVecLib && ArgVecLib->getNumValues() == 1) {
+// Map the vector library names from clang front-end to opt front-end. The
+// values are taken from the TargetLibraryInfo class command line options.
+std::optional OptVal =
+llvm::StringSwitch>(ArgVecLib->getValue())

david-arm wrote:

Yes I think that would work, i.e. having a static function in 
TargetLibraryInfo.h that can be called in two places and doesn't have a 
dependency on the component/library. Having said that, I won't hold this patch 
up for this if it's too difficult!

https://github.com/llvm/llvm-project/pull/78749
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [CXXNameMangler] Correct the mangling of SVE ACLE types within function names. (PR #69460)

2023-10-19 Thread David Sherwood via cfe-commits

https://github.com/david-arm approved this pull request.

LGTM! An outstanding work of art @paulwalker-arm!

https://github.com/llvm/llvm-project/pull/69460
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-20 Thread David Sherwood via cfe-commits

https://github.com/david-arm edited 
https://github.com/llvm/llvm-project/pull/69725
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-20 Thread David Sherwood via cfe-commits


@@ -9893,24 +9888,37 @@ Value *CodeGenFunction::FormSVEBuiltinResult(Value 
*Call) {
   return Call;
 }
 
-Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
-  const CallExpr *E) {
+void CodeGenFunction::GetAArch64SMEProcessedOperands(

david-arm wrote:

I wonder if actually this is better named as GetAArch64SVEProcessedOperands 
because if we have to choose a name that's common to both the SME and SVE 
builtins, choosing SVE might make more sense. That's because we're specifically 
dealing with scalable vectors in general here and not something that's 
intrinsically linked to SME.

https://github.com/llvm/llvm-project/pull/69725
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-20 Thread David Sherwood via cfe-commits


@@ -1016,29 +1021,24 @@ std::string Intrinsic::mangleName(ClassKind LocalCK) 
const {
  getMergeSuffix();
 }
 
-void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const {
+void Intrinsic::emitIntrinsic(raw_ostream &OS, ACLEKind Kind) const {
   bool IsOverloaded = getClassKind() == ClassG && getProto().size() > 1;
 
   std::string FullName = mangleName(ClassS);
   std::string ProtoName = mangleName(getClassKind());
   std::string SMEAttrs = "";
 
-  if (Flags & Emitter.getEnumValueForFlag("IsStreaming"))
-SMEAttrs += ", arm_streaming";
-  if (Flags & Emitter.getEnumValueForFlag("IsStreamingCompatible"))
-SMEAttrs += ", arm_streaming_compatible";
-  if (Flags & Emitter.getEnumValueForFlag("IsSharedZA"))
-SMEAttrs += ", arm_shared_za";
-  if (Flags & Emitter.getEnumValueForFlag("IsPreservesZA"))
-SMEAttrs += ", arm_preserves_za";
-
   OS << (IsOverloaded ? "__aio " : "__ai ")
- << "__attribute__((__clang_arm_builtin_alias("
- << (SMEAttrs.empty() ? "__builtin_sve_" : "__builtin_sme_")
- << FullName << ")";
-  if (!SMEAttrs.empty())
-OS << SMEAttrs;

david-arm wrote:

It looks like we're no longer printing out the attributes for the builtin - is 
this because the attributes are dealt with explicitly elsewhere in clang and so 
they are no longer needed?

https://github.com/llvm/llvm-project/pull/69725
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-20 Thread David Sherwood via cfe-commits

https://github.com/david-arm commented:

I've not done an exhaustive review, but thought I'd leave the comments I have 
so far!

https://github.com/llvm/llvm-project/pull/69725
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-20 Thread David Sherwood via cfe-commits


@@ -10272,29 +10291,13 @@ Value 
*CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);

david-arm wrote:

Do we still need this code given we're now checking the ICE arguments in 
GetAArch64SMEProcessedOperands?

https://github.com/llvm/llvm-project/pull/69725
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-20 Thread David Sherwood via cfe-commits


@@ -9893,24 +9888,37 @@ Value *CodeGenFunction::FormSVEBuiltinResult(Value 
*Call) {
   return Call;
 }
 
-Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
-  const CallExpr *E) {
+void CodeGenFunction::GetAArch64SMEProcessedOperands(
+unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
+SVETypeFlags TypeFlags) {
   // Find out if any arguments are required to be integer constant expressions.
   unsigned ICEArguments = 0;
   ASTContext::GetBuiltinTypeError Error;
   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
   assert(Error == ASTContext::GE_None && "Should not codegen an error");
 
-  llvm::Type *Ty = ConvertType(E->getType());
-  if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
-  BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
-Value *Val = EmitScalarExpr(E->getArg(0));
-return EmitSVEReinterpret(Val, Ty);
-  }
+  bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
 
-  llvm::SmallVector<Value *, 4> Ops;
   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-if ((ICEArguments & (1 << i)) == 0)
+if (!IsTupleGetOrSet && (ICEArguments & (1 << i)) == 0) {

david-arm wrote:

Perhaps you can create a temp variable and reuse it so it's a bit clearer, i.e.

```
  bool IsICE = ICEArguments & (1 << i);
  if (!IsTupleGetOrSet && !IsICE) {
  ...
  } else if (!IsICE) {
  ...
```

https://github.com/llvm/llvm-project/pull/69725
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-20 Thread David Sherwood via cfe-commits


@@ -9893,24 +9888,37 @@ Value *CodeGenFunction::FormSVEBuiltinResult(Value 
*Call) {
   return Call;
 }
 
-Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
-  const CallExpr *E) {
+void CodeGenFunction::GetAArch64SMEProcessedOperands(
+unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
+SVETypeFlags TypeFlags) {
   // Find out if any arguments are required to be integer constant expressions.
   unsigned ICEArguments = 0;
   ASTContext::GetBuiltinTypeError Error;
   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
   assert(Error == ASTContext::GE_None && "Should not codegen an error");
 
-  llvm::Type *Ty = ConvertType(E->getType());
-  if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
-  BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
-Value *Val = EmitScalarExpr(E->getArg(0));
-return EmitSVEReinterpret(Val, Ty);
-  }
+  bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
 
-  llvm::SmallVector<Value *, 4> Ops;
   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-if ((ICEArguments & (1 << i)) == 0)

david-arm wrote:

Might be worth adding a comment explaining why we explicitly ignore tuple 
get/set functions?

https://github.com/llvm/llvm-project/pull/69725
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-25 Thread David Sherwood via cfe-commits

https://github.com/david-arm edited 
https://github.com/llvm/llvm-project/pull/69725
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-25 Thread David Sherwood via cfe-commits

https://github.com/david-arm commented:

This looks a lot better now @kmclaughlin-arm - thanks for the changes! I just 
have a couple of comments about the tests that I missed previously...

https://github.com/llvm/llvm-project/pull/69725
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][SME2] Add multi-vector add/sub builtins (PR #69725)

2023-10-25 Thread David Sherwood via cfe-commits


@@ -0,0 +1,418 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 
-target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 
-target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S 
-disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subj_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Single-Multi
+//
+
+// x2
+// CHECK-LABEL: @test_svsub_write_single2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CHECK-NEXT:[[TMP0:%.*]] = tail call <vscale x 4 x i32> 
@llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:[[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:tail call void 
@llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[ADD]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:ret void
+//
+// CPP-CHECK-LABEL: 
@_Z28test_svsub_write_single2_u32j12svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:[[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call <vscale x 4 x i32> 
@llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:[[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:tail call void 
@llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[ADD]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:ret void
+//
+void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, 
svuint32_t zm) {
+  SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base + 7, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CHECK-NEXT:[[TMP0:%.*]] = tail call <vscale x 2 x i64> 
@llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:[[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:tail call void 
@llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[ADD]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:ret void
+//
+// CPP-CHECK-LABEL: 
@_Z28test_svsub_write_single2_u64j12svuint64x2_tu12__SVUint64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:[[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call <vscale x 2 x i64> 
@llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:[[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT:tail call void 
@llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[ADD]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT:ret void
+//
+void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, 
svuint64_t zm) {
+  SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base + 7, zn, zm);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svsub_write_single4_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 7
+// CHECK-NEXT:[[TMP0:%.*]] = tail call  
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0)
+// CHECK-NEXT:[[TMP1:%.*]] = tail call  
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4)
+// CHECK-NEXT:[[TMP2:%.*]] = tail call  
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8)
+// CHECK-NEXT:[[TMP3:%.*]] = tail call  
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12)
+// CHECK-NEXT:tail call void 
@llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[ADD]],  [[TMP0]], 

  1   2   >