This patch is the main middle-end piece of a fix for PR tree-opt/98335,
which is a code-quality regression affecting mainline.  The issue occurs
in DSE's (dead store elimination's) compute_trims function that determines
where a store to memory can be trimmed.  In the testcase given in the
PR, this function notices that the first byte of a DImode store is dead,
and replaces the 8-byte store at (aligned) offset zero, with a 7-byte store
at (unaligned) offset one.  Most architectures can store a power-of-two
bytes (up to a maximum) in single instruction, so writing 7 bytes requires
more instructions than writing 8 bytes.  This patch follows Jakub Jelinek's
suggestion in comment 5, that compute_trims needs improved heuristics.

In this patch, decision of whether and how to align trim_head is based
on the number of bytes being written, the alignment of the start of the
object and where within the object the first byte is written.  The first
tests check whether we're already writing to the start of the object,
and that we're writing three or more bytes.  If we're only writing one
or two bytes, there's no benefit from providing additional alignment.
Then we determine the alignment of the object, which is either 1, 2,
4, 8 or 16 byte aligned (capping at 16 guarantees that we never write
more than 7 bytes beyond the minimum required).  If the buffer is only
1 or 2 byte aligned there's no benefit from additional alignment.  For
the remaining cases, alignment of trim_head is based upon where within
each aligned block (word) the first byte is written.  For example,
storing the last byte (or last half-word) of a word can be performed
with a single insn.

On x86_64-pc-linux-gnu with -O2 the new test case in the PR goes from:

        movl    $0, -24(%rsp)
        movabsq $72057594037927935, %rdx
        movl    $0, -21(%rsp)
        andq    -24(%rsp), %rdx
        movq    %rdx, %rax
        salq    $8, %rax
        movb    c(%rip), %al
        ret

to

        xorl    %eax, %eax
        movb    c(%rip), %al
        ret

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check with no new failures.  I've also added new testcases
for the original motivating PR tree-optimization/86010, to ensure that
those remain optimized (in future).  Ok for mainline?

2022-03-07  Roger Sayle  <ro...@nextmovesoftware.com>

gcc/ChangeLog
        PR tree-optimization/98335
        * tree-ssa-dse.cc (compute_trims): Improve logic deciding whether
        to align trim_head, writing more bytes by using fewer instructions.

gcc/testsuite/ChangeLog
        PR tree-optimization/98335
        * g++.dg/pr98335.C: New test case.
        * gcc.dg/pr86010.c: New test case.
        * gcc.dg/pr86010-2.c: New test case.

Thanks in advance,
Roger
--

diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
index 2b22a61..080e406 100644
--- a/gcc/tree-ssa-dse.cc
+++ b/gcc/tree-ssa-dse.cc
@@ -405,10 +405,36 @@ compute_trims (ao_ref *ref, sbitmap live, int *trim_head, 
int *trim_tail,
   int first_live = bitmap_first_set_bit (live);
   *trim_head = first_live - first_orig;
 
-  /* If more than a word remains, then make sure to keep the
-     starting point at least word aligned.  */
-  if (last_live - first_live > UNITS_PER_WORD)
-    *trim_head &= ~(UNITS_PER_WORD - 1);
+  /* If REF is aligned, try to maintain this alignment if it reduces
+     the number of (power-of-two sized aligned) writes to memory.
+     First check that we're writing >= 3 bytes at a non-zero offset.  */
+  if (first_live
+      && last_live - first_live >= 2)
+    {
+      unsigned int align = TYPE_ALIGN_UNIT (TREE_TYPE (ref->base));
+      if (DECL_P (ref->base) && DECL_ALIGN_UNIT (ref->base) > align)
+       align = DECL_ALIGN_UNIT (ref->base);
+      if (align > UNITS_PER_WORD)
+       align = UNITS_PER_WORD;
+      if (align > 16)
+       align = 16;
+      if (align > 2)
+       {
+         /* ALIGN is 4, 8 or 16.  */
+         unsigned int low = first_live & (align - 1);
+         if (low * 2 < align)
+           {
+             if (align == 16 && low >= 4 && last_live < 15)
+               *trim_head &= ~3;
+             else
+               *trim_head &= ~(align - 1);
+           }
+         else if (low + 3 == align)
+           *trim_head &= ~1;
+         else if (low > 8 && low < 12)
+           *trim_head &= ~3;
+       }
+    }
 
   if ((*trim_head || *trim_tail)
       && dump_file && (dump_flags & TDF_DETAILS))
diff --git a/gcc/testsuite/g++.dg/pr98335.C b/gcc/testsuite/g++.dg/pr98335.C
new file mode 100644
index 0000000..c54f4d9
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr98335.C
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+struct Data {
+  char a;
+  int b;
+};
+
+char c;
+
+Data val(int idx) {
+  return { c };  // { dg-warning "extended initializer" "c++ 98"  { target { 
c++98_only } } }
+}
+
+/* { dg-final { scan-tree-dump-not " + 1B] = {}" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/pr86010-2.c b/gcc/testsuite/gcc.dg/pr86010-2.c
new file mode 100644
index 0000000..4c82e65
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr86010-2.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+void f (void*);
+
+void g (char *a)
+{
+  __builtin_memset (a, 0, 8);
+  __builtin_memset (a, 0, 8);
+
+  f (a);
+}
+
+void h (char *a)
+{
+  __builtin_memset (a, 0, 8);
+  __builtin_memset (a, 0, 7);
+
+  f (a);
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_memset" 2 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/pr86010.c b/gcc/testsuite/gcc.dg/pr86010.c
new file mode 100644
index 0000000..ac27989
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr86010.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+void f (void*);
+
+void g (void)
+{
+  char a[8];
+  __builtin_memset (a, 0, 8);
+  __builtin_memset (a, 0, 8);
+
+  f (a);
+}
+
+void h (void)
+{
+  char a[8];
+  __builtin_memset (a, 0, 8);
+  __builtin_memset (a, 0, 7);
+
+  f (a);
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_memset" 2 "optimized" } } */

Reply via email to