[PATCH] D159480: [Clang][AArch64] Fine-grained ldp and stp policies.

2023-09-07 Thread Manos Anagnostakis via Phabricator via cfe-commits
manosanag updated this revision to Diff 556154.
manosanag added a comment.

Updated Options.td to provide visibility for the options; the missing
visibility caused a regression in my Fortran tests after rebasing onto
the current LLVM main branch.
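
For reference, the kind of Options.td change involved looks roughly like the
sketch below. This is an assumption-laden illustration, not the actual diff:
the record name and the exact visibility list are guesses; only the option
spelling comes from the patch.

  // Hypothetical sketch: after the 2023 Options.td visibility refactoring,
  // an option record must list the drivers it is visible to, otherwise the
  // driver ignores it (the likely cause of the Fortran regression).
  def aarch64_ldp_policy_EQ : Joined<["-"], "aarch64-ldp-policy=">,
    Visibility<[ClangOption, FlangOption]>,
    HelpText<"Policy for emitting AArch64 ldp instructions">;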


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D159480/new/

https://reviews.llvm.org/D159480

Files:
  clang/include/clang/Driver/Options.td
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Driver/ToolChains/CommonArgs.cpp
  clang/lib/Driver/ToolChains/Flang.cpp
  clang/test/Driver/aarch64-ldp-policy.c
  clang/test/Driver/aarch64-stp-policy.c
  clang/test/Driver/flang/aarch64-ldp-policy.f90
  clang/test/Driver/flang/aarch64-stp-policy.f90
  llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
  llvm/test/CodeGen/AArch64/ldp-aligned.ll
  llvm/test/CodeGen/AArch64/ldp-always.ll
  llvm/test/CodeGen/AArch64/ldp-never.ll
  llvm/test/CodeGen/AArch64/stp-aligned.ll
  llvm/test/CodeGen/AArch64/stp-always.ll
  llvm/test/CodeGen/AArch64/stp-never.ll

Index: llvm/test/CodeGen/AArch64/stp-never.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/stp-never.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-stp-policy=never | FileCheck %s
+
+define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_aligned_int32_t:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x0, x0, #0xffffffffffffffc0
+; CHECK-NEXT:str w1, [x0]
+; CHECK-NEXT:str w1, [x0, #4]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  store i32 %1, ptr %5, align 64
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  ret ptr %5
+}
+
+define ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_aligned_int64_t:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x0, x0, #0xffffffffffffff80
+; CHECK-NEXT:str x1, [x0]
+; CHECK-NEXT:str x1, [x0, #8]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  store i64 %1, ptr %5, align 128
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  ret ptr %5
+}
+
+define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_aligned_v4si:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x0, x0, #0xffffffffffffff00
+; CHECK-NEXT:str q0, [x0]
+; CHECK-NEXT:str q0, [x0, #16]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+  store <4 x i32> %1, ptr %5, align 256
+  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
+  store <4 x i32> %1, ptr %6, align 16
+  ret ptr %5
+}
+
+define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int32_t:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:orr x0, x8, #0x4
+; CHECK-NEXT:str w1, [x8, #4]
+; CHECK-NEXT:str w1, [x8, #8]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  %7 = getelementptr inbounds i32, ptr %5, i64 2
+  store i32 %1, ptr %7, align 8
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int64_t:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:orr x0, x8, #0x8
+; CHECK-NEXT:str x1, [x8, #8]
+; CHECK-NEXT:str x1, [x8, #16]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  %7 = getelementptr inbounds i64, ptr %5, i64 2
+  store i64 %1, ptr %7, align 16
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_unaligned_v4si:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:orr x0, x8, #0x10
+; CHECK-NEXT:str q0, [x8, #16]
+; CHECK-NEXT:str q0, [x8, #32]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
+  store <4 x i32> %1, ptr %6, align 16
+  %7 = getelementptr inbounds <4 x i32>, ptr %5, i64 2
+  store <4 x i32> %1, ptr %7, align 32
+  ret ptr %6
+}
+
Index: llvm/test/CodeGen/AArch64/stp-always.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/stp-always.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-stp-policy=always | FileCheck %s
+

[PATCH] D159480: [Clang][AArch64] Fine-grained ldp and stp policies.

2023-09-08 Thread Manos Anagnostakis via Phabricator via cfe-commits
manosanag added a comment.

Hello Dave,

thanks for replying.

Yes, this is an optimization.

On some AArch64 cores, including Ampere's ampere1 architecture, which this
patch primarily targets, load/store pair instructions are faster than the
equivalent simple loads/stores only when the alignment of the pair is at
least twice that of the individual element being loaded or stored. Based on
the performance of various benchmarks, GCC at some point stopped emitting
ldp/stp for such cores (the discussion is at
https://gcc.gnu.org/pipermail/gcc-patches/2023-April/615672.html). This
patch improves on that approach by offering control over when the
instructions are used.
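
To make the condition concrete, here is a minimal IR sketch (the function
name and exact IR are illustrative, not taken from the attached tests).
Under -aarch64-stp-policy=aligned the two i64 stores below may be merged
into an stp, because the pair starts at a 16-byte boundary, i.e. 2 * the
8-byte element size:

; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-stp-policy=aligned
define void @stp_pair_sketch(ptr %p, i64 %v) {
  ; The first store is 16-byte aligned, so the pair satisfies the
  ; 2 * element_size check and the aligned policy permits an stp.
  store i64 %v, ptr %p, align 16
  %q = getelementptr inbounds i64, ptr %p, i64 1
  store i64 %v, ptr %q, align 8
  ret void
}

With -aarch64-stp-policy=never the same input keeps two separate str
instructions, which is what the stp-never.ll tests in the diff check.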

A similar patch with the same flags was recently submitted for review on
the GCC mailing list
(https://gcc.gnu.org/pipermail/gcc-patches/2023-August/628590.html).

I have a fix ready for the Fortran regressions shown by autotesting. I can
include some of this information in the commit message of the diff.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D159480/new/

https://reviews.llvm.org/D159480



[PATCH] D159480: [Clang][AArch64] Fine-grained ldp and stp policies.

2023-09-12 Thread Manos Anagnostakis via Phabricator via cfe-commits
manosanag added a comment.

Moved to https://github.com/llvm/llvm-project/pull/66098.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D159480/new/

https://reviews.llvm.org/D159480



[PATCH] D159480: [Clang][AArch64] Fine-grained ldp and stp policies.

2023-09-07 Thread Manos Anagnostakis via Phabricator via cfe-commits
manosanag created this revision.
Herald added subscribers: hiraditya, kristof.beyls.
Herald added a reviewer: sscalpone.
Herald added a project: All.
manosanag requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, MaskRay.
Herald added projects: clang, LLVM.

This patch enables fine-grained tuning control for ldp and stp.

It provides two new command-line options, -aarch64-ldp-policy and
-aarch64-stp-policy, which allow the load and store pairing policies to be
controlled separately from both the clang and flang-new frontends,
including when using -flto.

The accepted values for both options are:

- default: Use the ldp/stp policy the compiler currently applies (equivalent to always).
- always: Emit ldp/stp regardless of alignment.
- never: Do not emit ldp/stp.
- aligned: Emit ldp/stp only when the load/store pair is aligned to 2 * element_size.
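
For example, the backend options can be exercised directly through llc,
exactly as in the RUN lines of the attached tests (input.ll is just a
placeholder name):

  llc < input.ll -O2 -mtriple=aarch64 -mcpu=generic -aarch64-ldp-policy=aligned
  llc < input.ll -O2 -mtriple=aarch64 -mcpu=generic -aarch64-stp-policy=never

To make the aligned check concrete: two adjacent i32 accesses (element size
4) are paired only when the pair starts at an 8-byte boundary; two i64
accesses need a 16-byte boundary, and two 128-bit vector accesses a 32-byte
boundary.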


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D159480

Files:
  clang/include/clang/Driver/Options.td
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Driver/ToolChains/CommonArgs.cpp
  clang/lib/Driver/ToolChains/Flang.cpp
  clang/test/Driver/aarch64-ldp-policy.c
  clang/test/Driver/aarch64-stp-policy.c
  clang/test/Driver/flang/aarch64-ldp-policy.f90
  clang/test/Driver/flang/aarch64-stp-policy.f90
  llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
  llvm/test/CodeGen/AArch64/ldp-aligned.ll
  llvm/test/CodeGen/AArch64/ldp-always.ll
  llvm/test/CodeGen/AArch64/ldp-never.ll
  llvm/test/CodeGen/AArch64/stp-aligned.ll
  llvm/test/CodeGen/AArch64/stp-always.ll
  llvm/test/CodeGen/AArch64/stp-never.ll

Index: llvm/test/CodeGen/AArch64/stp-never.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/stp-never.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-stp-policy=never | FileCheck %s
+
+define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_aligned_int32_t:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x0, x0, #0xffffffffffffffc0
+; CHECK-NEXT:str w1, [x0]
+; CHECK-NEXT:str w1, [x0, #4]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  store i32 %1, ptr %5, align 64
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  ret ptr %5
+}
+
+define ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_aligned_int64_t:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x0, x0, #0xffffffffffffff80
+; CHECK-NEXT:str x1, [x0]
+; CHECK-NEXT:str x1, [x0, #8]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  store i64 %1, ptr %5, align 128
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  ret ptr %5
+}
+
+define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_aligned_v4si:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x0, x0, #0xffffffffffffff00
+; CHECK-NEXT:str q0, [x0]
+; CHECK-NEXT:str q0, [x0, #16]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+  store <4 x i32> %1, ptr %5, align 256
+  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
+  store <4 x i32> %1, ptr %6, align 16
+  ret ptr %5
+}
+
+define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int32_t:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:orr x0, x8, #0x4
+; CHECK-NEXT:str w1, [x8, #4]
+; CHECK-NEXT:str w1, [x8, #8]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  %7 = getelementptr inbounds i32, ptr %5, i64 2
+  store i32 %1, ptr %7, align 8
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int64_t:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:orr x0, x8, #0x8
+; CHECK-NEXT:str x1, [x8, #8]
+; CHECK-NEXT:str x1, [x8, #16]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  %7 = getelementptr inbounds i64, ptr %5, i64 2
+  store i64 %1, ptr %7, align 16
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_unaligned_v4si:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:orr x0, x8, #0x10
+; CHECK-NEXT:str q0, [x8, #16]
+; CHECK-NEXT:str q0, [x8, #32]
+; CHECK-NEXT:ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+