If a single instruction can store or move the whole block of memory, use
a vector instruction and don't align the destination.
gcc/
PR target/121934
* config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
single instruction can store or move the whole block of memory,
use vector instruction and don't align destination.
gcc/testsuite/
PR target/121934
* gcc.target/i386/pr121934-1a.c: New test.
* gcc.target/i386/pr121934-1b.c: Likewise.
* gcc.target/i386/pr121934-2a.c: Likewise.
* gcc.target/i386/pr121934-2b.c: Likewise.
* gcc.target/i386/pr121934-3a.c: Likewise.
* gcc.target/i386/pr121934-3b.c: Likewise.
* gcc.target/i386/pr121934-4a.c: Likewise.
* gcc.target/i386/pr121934-4b.c: Likewise.
* gcc.target/i386/pr121934-5a.c: Likewise.
* gcc.target/i386/pr121934-5b.c: Likewise.
Signed-off-by: H.J. Lu <[email protected]>
---
gcc/config/i386/i386-expand.cc | 62 +++++++++++++--------
gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-1b.c | 7 +++
gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-2b.c | 7 +++
gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-3b.c | 7 +++
gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-4b.c | 7 +++
gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-5b.c | 7 +++
11 files changed, 187 insertions(+), 24 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index dc26b3452cb..b0b9e6da946 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
if (!issetmem)
srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
+ bool aligned_dstmem = false;
+ unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
+ bool single_insn_p = count && count <= nunits;
+ if (single_insn_p)
+ {
+ /* If it can be done with a single instruction, use vector
+ instruction and don't align destination. */
+ alg = vector_loop;
+ noalign = true;
+ dynamic_check = -1;
+ }
+
unroll_factor = 1;
move_mode = word_mode;
- int nunits;
switch (alg)
{
case libcall:
@@ -9576,7 +9587,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
need_zero_guard = true;
unroll_factor = 4;
/* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */
- nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
nunits /= GET_MODE_SIZE (word_mode);
if (nunits > 1)
{
@@ -9629,28 +9639,32 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
}
gcc_assert (desired_align >= 1 && align >= 1);
- /* Misaligned move sequences handle both prologue and epilogue at once.
- Default code generation results in a smaller code for large alignments
- and also avoids redundant job when sizes are known precisely. */
- misaligned_prologue_used
- = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
- && MAX (desired_align, epilogue_size_needed) <= 32
- && desired_align <= epilogue_size_needed
- && ((desired_align > align && !align_bytes)
- || (!count && epilogue_size_needed > 1)));
-
- /* Destination is aligned after the misaligned prologue. */
- bool aligned_dstmem = misaligned_prologue_used;
-
- if (noalign && !misaligned_prologue_used)
- {
- /* Also use misaligned prologue if alignment isn't needed and
- destination isn't aligned. Since alignment isn't needed,
- the destination after prologue won't be aligned. */
- aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
- <= MEM_ALIGN (dst));
- if (!aligned_dstmem)
- misaligned_prologue_used = true;
+ if (!single_insn_p)
+ {
+ /* Misaligned move sequences handle both prologue and epilogue
+ at once. Default code generation results in a smaller code
+ for large alignments and also avoids redundant job when sizes
+ are known precisely. */
+ misaligned_prologue_used
+ = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
+ && MAX (desired_align, epilogue_size_needed) <= 32
+ && desired_align <= epilogue_size_needed
+ && ((desired_align > align && !align_bytes)
+ || (!count && epilogue_size_needed > 1)));
+
+ /* Destination is aligned after the misaligned prologue. */
+ aligned_dstmem = misaligned_prologue_used;
+
+ if (noalign && !misaligned_prologue_used)
+ {
+ /* Also use misaligned prologue if alignment isn't needed and
+ destination isn't aligned. Since alignment isn't needed,
+ the destination after prologue won't be aligned. */
+ aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
+ <= MEM_ALIGN (dst));
+ if (!aligned_dstmem)
+ misaligned_prologue_used = true;
+ }
}
/* Do the cheap promotion to allow better CSE across the
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1a.c b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
new file mode 100644
index 00000000000..6b6881367db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
+
+extern int f();
+int a, b, c, d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1b.c b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
new file mode 100644
index 00000000000..47381ec3476
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-1a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2a.c b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
new file mode 100644
index 00000000000..49def11aa4e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
+
+extern int f();
+int a, b, c;
+long long int d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = (long long int) -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2b.c b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
new file mode 100644
index 00000000000..1c634dfe420
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-2a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3a.c b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
new file mode 100644
index 00000000000..0c04b69c0d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2" } */
+
+extern int f();
+int a, b, c;
+_BitInt(128) d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = (_BitInt(128)) -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3b.c b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
new file mode 100644
index 00000000000..ff4b0831cea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-3a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4a.c b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
new file mode 100644
index 00000000000..5aa3e069cff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256" } */
+
+extern int f();
+int a, b, c;
+_BitInt(256) d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = (_BitInt(256)) -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4b.c b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
new file mode 100644
index 00000000000..5f8241dcad5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-4a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5a.c b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
new file mode 100644
index 00000000000..10be0dd4343
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512" } */
+
+extern int f();
+int a, b, c;
+_BitInt(512) d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = (_BitInt(512)) -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5b.c b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
new file mode 100644
index 00000000000..6a45a8a7a8b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-5a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
--
2.51.0