On Tue, 24 May 2022 17:55:24 PDT (-0700), Vineet Gupta wrote:
On 7/22/21 15:41, Christoph Muellner via Gcc-patches wrote:
This patch enables the overlap-by-pieces feature of the by-pieces
infrastructure for inlining builtins in case the target has set
riscv_slow_unaligned_access_p to false.
An example to demonstrate the effect for targets with fast unaligned
access (targets that have slow_unaligned_access set to false) is
the code that is generated for "memset (p, 0, 15);", where the
alignment of p is unknown:
Without overlap_op_by_pieces we get:
8e: 00053023 sd zero,0(a0)
92: 00052423 sw zero,8(a0)
96: 00051623 sh zero,12(a0)
9a: 00050723 sb zero,14(a0)
With overlap_op_by_pieces we get:
7e: 00053023 sd zero,0(a0)
82: 000533a3 sd zero,7(a0)
gcc/ChangeLog:
* config/riscv/riscv.c (riscv_overlap_op_by_pieces): New function.
(TARGET_OVERLAP_OP_BY_PIECES_P): Connect to
riscv_overlap_op_by_pieces.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/builtins-overlap-1.c: New test.
* gcc.target/riscv/builtins-overlap-2.c: New test.
* gcc.target/riscv/builtins-overlap-3.c: New test.
* gcc.target/riscv/builtins-overlap-4.c: New test.
* gcc.target/riscv/builtins-overlap-5.c: New test.
* gcc.target/riscv/builtins-overlap-6.c: New test.
* gcc.target/riscv/builtins-overlap-7.c: New test.
* gcc.target/riscv/builtins-overlap-8.c: New test.
* gcc.target/riscv/builtins-strict-align.c: New test.
* gcc.target/riscv/builtins.h: New test.
Signed-off-by: Christoph Muellner <cmuell...@gcc.gnu.org>
Ping, IMO this needs to be (re)considered for trunk.
This goes really nicely with riscv_slow_unaligned_access_p==false, to
elide the unrolled tail copies for trailer word/sword/byte accesses.
@Kito, @Palmer? Just from a codegen POV this seems to be a no-brainer.
Has anything changed since this was posted?
IIRC the discussion
essentially boiled down to that overlapping store likely being a hard
case on in-order machines (like the C906), but there weren't any
benchmarks or documentation so we could figure that out. I don't see
how this is an obvious win: sure it's fewer ops (and assuming a uniform
distribution fewer misaligned accesses, though I don't know how
reasonable uniform distributions are here), but it's only a small upside
so that hard case would have to be fast in order for this to be better
code.
If someone has benchmarks showing these are actually faster on the C906
(or even some documentation describing how these accesses are handled)
then I'm happy to take the code (with the -Os bit fixed). It shouldn't
be all that hard of a benchmark to run...
foo:
sd zero,0(a0)
sw zero,8(a0)
sh zero,12(a0)
sb zero,14(a0)
vs.
sd zero,0(a0)
sd zero,7(a0)
-Vineet
---
gcc/config/riscv/riscv.c | 11 +++++++++++
.../gcc.target/riscv/builtins-overlap-1.c | 10 ++++++++++
.../gcc.target/riscv/builtins-overlap-2.c | 10 ++++++++++
.../gcc.target/riscv/builtins-overlap-3.c | 10 ++++++++++
.../gcc.target/riscv/builtins-overlap-4.c | 10 ++++++++++
.../gcc.target/riscv/builtins-overlap-5.c | 11 +++++++++++
.../gcc.target/riscv/builtins-overlap-6.c | 13 +++++++++++++
.../gcc.target/riscv/builtins-overlap-7.c | 11 +++++++++++
.../gcc.target/riscv/builtins-overlap-8.c | 11 +++++++++++
.../gcc.target/riscv/builtins-strict-align.c | 10 ++++++++++
gcc/testsuite/gcc.target/riscv/builtins.h | 16 ++++++++++++++++
11 files changed, 123 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins-overlap-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins-overlap-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins-overlap-3.c
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins-overlap-4.c
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins-overlap-5.c
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins-overlap-6.c
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins-overlap-7.c
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins-overlap-8.c
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins-strict-align.c
create mode 100644 gcc/testsuite/gcc.target/riscv/builtins.h
diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
index 576960bb37c..98c76ba657a 100644
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -5201,6 +5201,14 @@ riscv_slow_unaligned_access (machine_mode, unsigned int)
return riscv_slow_unaligned_access_p;
}
+/* Implement TARGET_OVERLAP_OP_BY_PIECES_P. */
+
+static bool
+riscv_overlap_op_by_pieces (void)
+{
+ return !riscv_slow_unaligned_access_p;
+}
+
/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
static bool
@@ -5525,6 +5533,9 @@ riscv_asan_shadow_offset (void)
#undef TARGET_SLOW_UNALIGNED_ACCESS
#define TARGET_SLOW_UNALIGNED_ACCESS riscv_slow_unaligned_access
+#undef TARGET_OVERLAP_OP_BY_PIECES_P
+#define TARGET_OVERLAP_OP_BY_PIECES_P riscv_overlap_op_by_pieces
+
#undef TARGET_SECONDARY_MEMORY_NEEDED
#define TARGET_SECONDARY_MEMORY_NEEDED riscv_secondary_memory_needed
diff --git a/gcc/testsuite/gcc.target/riscv/builtins-overlap-1.c b/gcc/testsuite/gcc.target/riscv/builtins-overlap-1.c
new file mode 100644
index 00000000000..ca51fff0fc6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins-overlap-1.c
@@ -0,0 +1,10 @@
+/* { dg-options "-O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64" } */
+/* { dg-do compile } */
+
+#include "builtins.h"
+
+DO_MEMSET0_N(7)
+
+/* { dg-final { scan-assembler-times "sw\tzero,0" 1 } } */
+/* { dg-final { scan-assembler-times "sw\tzero,3" 1 } } */
+/* { dg-final { scan-assembler-not "sb" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/builtins-overlap-2.c b/gcc/testsuite/gcc.target/riscv/builtins-overlap-2.c
new file mode 100644
index 00000000000..24b5b254658
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins-overlap-2.c
@@ -0,0 +1,10 @@
+/* { dg-options "-O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64" } */
+/* { dg-do compile } */
+
+#include "builtins.h"
+
+DO_MEMSET0_N(11)
+
+/* { dg-final { scan-assembler-times "sd\tzero,0" 1 } } */
+/* { dg-final { scan-assembler-times "sw\tzero,7" 1 } } */
+/* { dg-final { scan-assembler-not "sb" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/builtins-overlap-3.c b/gcc/testsuite/gcc.target/riscv/builtins-overlap-3.c
new file mode 100644
index 00000000000..636031cb944
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins-overlap-3.c
@@ -0,0 +1,10 @@
+/* { dg-options "-O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64" } */
+/* { dg-do compile } */
+
+#include "builtins.h"
+
+DO_MEMSET0_N(13)
+
+/* { dg-final { scan-assembler-times "sd\tzero,0" 1 } } */
+/* { dg-final { scan-assembler-times "sd\tzero,5" 1 } } */
+/* { dg-final { scan-assembler-not "sb" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/builtins-overlap-4.c b/gcc/testsuite/gcc.target/riscv/builtins-overlap-4.c
new file mode 100644
index 00000000000..15d77860050
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins-overlap-4.c
@@ -0,0 +1,10 @@
+/* { dg-options "-O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64" } */
+/* { dg-do compile } */
+
+#include "builtins.h"
+
+DO_MEMSET0_N(15)
+
+/* { dg-final { scan-assembler-times "sd\tzero,0" 1 } } */
+/* { dg-final { scan-assembler-times "sd\tzero,7" 1 } } */
+/* { dg-final { scan-assembler-not "sb" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/builtins-overlap-5.c b/gcc/testsuite/gcc.target/riscv/builtins-overlap-5.c
new file mode 100644
index 00000000000..faccb301f84
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins-overlap-5.c
@@ -0,0 +1,11 @@
+/* { dg-options "-O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64" } */
+/* { dg-do compile } */
+
+#include "builtins.h"
+
+DO_MEMCPY_N(7)
+
+/* { dg-final { scan-assembler-times "lw" 2 } } */
+/* { dg-final { scan-assembler-times "sw" 2 } } */
+/* { dg-final { scan-assembler-not "lb" } } */
+/* { dg-final { scan-assembler-not "sb" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/builtins-overlap-6.c b/gcc/testsuite/gcc.target/riscv/builtins-overlap-6.c
new file mode 100644
index 00000000000..51e9b37ba5a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins-overlap-6.c
@@ -0,0 +1,13 @@
+/* { dg-options "-O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64" } */
+/* { dg-do compile } */
+
+#include "builtins.h"
+
+DO_MEMCPY_N(11)
+
+/* { dg-final { scan-assembler-times "ld" 1 } } */
+/* { dg-final { scan-assembler-times "sw" 1 } } */
+/* { dg-final { scan-assembler-times "lw" 1 } } */
+/* { dg-final { scan-assembler-times "sw" 1 } } */
+/* { dg-final { scan-assembler-not "lb" } } */
+/* { dg-final { scan-assembler-not "sb" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/builtins-overlap-7.c b/gcc/testsuite/gcc.target/riscv/builtins-overlap-7.c
new file mode 100644
index 00000000000..44fdaa398ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins-overlap-7.c
@@ -0,0 +1,11 @@
+/* { dg-options "-O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64" } */
+/* { dg-do compile } */
+
+#include "builtins.h"
+
+DO_MEMCPY_N(13)
+
+/* { dg-final { scan-assembler-times "ld" 2 } } */
+/* { dg-final { scan-assembler-times "sd" 2 } } */
+/* { dg-final { scan-assembler-not "lb" } } */
+/* { dg-final { scan-assembler-not "sb" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/builtins-overlap-8.c b/gcc/testsuite/gcc.target/riscv/builtins-overlap-8.c
new file mode 100644
index 00000000000..61186ae09a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins-overlap-8.c
@@ -0,0 +1,11 @@
+/* { dg-options "-O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64" } */
+/* { dg-do compile } */
+
+#include "builtins.h"
+
+DO_MEMCPY_N(15)
+
+/* { dg-final { scan-assembler-times "ld" 2 } } */
+/* { dg-final { scan-assembler-times "sd" 2 } } */
+/* { dg-final { scan-assembler-not "lb" } } */
+/* { dg-final { scan-assembler-not "sb" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/builtins-strict-align.c b/gcc/testsuite/gcc.target/riscv/builtins-strict-align.c
new file mode 100644
index 00000000000..5d06c6eea08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins-strict-align.c
@@ -0,0 +1,10 @@
+/* { dg-options "-O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64 -mstrict-align" } */
+/* { dg-do compile } */
+
+#include "builtins.h"
+
+DO_MEMSET0_N(15)
+
+/* { dg-final { scan-assembler-times "sb\tzero" 15 } } */
+/* { dg-final { scan-assembler-not "sw" } } */
+/* { dg-final { scan-assembler-not "sd" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/builtins.h b/gcc/testsuite/gcc.target/riscv/builtins.h
new file mode 100644
index 00000000000..22b2800d464
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/builtins.h
@@ -0,0 +1,16 @@
+#ifndef BUILTINS_H
+#define BUILTINS_H
+
+#define DO_MEMSET0_N(N) \
+void do_memset0_##N (void *p) \
+{ \
+ __builtin_memset (p, 0, N); \
+}
+
+#define DO_MEMCPY_N(N) \
+void do_memcpy_##N (void *d, void *s) \
+{ \
+ __builtin_memcpy (d, s, N); \
+}
+
+#endif /* BUILTINS_H */