Hi,
  This patch enables overlap memory store for block memory clear which
saves the number of store instructions. The expander calls
widest_fixed_size_mode_for_block_clear to get the mode for looped block
clear and calls widest_fixed_size_mode_for_block_clear to get the mode
for last overlapped clear.

    Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
regressions. Is it OK for the trunk or next stage 1?

Thanks
Gui Haochen


ChangeLog
rs6000: Enable overlap memory store for block memory clear

gcc/
        * config/rs6000/rs6000-string.cc
        (widest_fixed_size_mode_for_block_clear): New.
        (smallest_fixed_size_mode_for_block_clear): New.
        (expand_block_clear): Call widest_fixed_size_mode_for_block_clear to
        get the mode for looped memory stores and call
        smallest_fixed_size_mode_for_block_clear to get the mode for the last
        overlapped memory store.

gcc/testsuite
        * gcc.target/powerpc/block-clear-1.c: New.


patch.diff
diff --git a/gcc/config/rs6000/rs6000-string.cc 
b/gcc/config/rs6000/rs6000-string.cc
index 133e5382af2..c2a6095a586 100644
--- a/gcc/config/rs6000/rs6000-string.cc
+++ b/gcc/config/rs6000/rs6000-string.cc
@@ -38,6 +38,49 @@
 #include "profile-count.h"
 #include "predict.h"

+/* Return the widest mode which mode size is less than or equal to the
+   size.  */
+static fixed_size_mode
+widest_fixed_size_mode_for_block_clear (unsigned int size, unsigned int align,
+                                       bool unaligned_vsx_ok)
+{
+  machine_mode mode;
+
+  if (TARGET_ALTIVEC
+      && size >= 16
+      && (align >= 128
+         || unaligned_vsx_ok))
+    mode = V4SImode;
+  else if (size >= 8
+          && TARGET_POWERPC64
+          && (align >= 64
+              || !STRICT_ALIGNMENT))
+    mode = DImode;
+  else if (size >= 4
+          && (align >= 32
+              || !STRICT_ALIGNMENT))
+    mode = SImode;
+  else if (size >= 2
+          && (align >= 16
+              || !STRICT_ALIGNMENT))
+    mode = HImode;
+  else
+    mode = QImode;
+
+  return as_a <fixed_size_mode> (mode);
+}
+
+/* Return the smallest mode which mode size is smaller than or eqaul to
+   the size.  */
+static fixed_size_mode
+smallest_fixed_size_mode_for_block_clear (unsigned int size)
+{
+  if (size > UNITS_PER_WORD)
+    return as_a <fixed_size_mode> (V4SImode);
+
+  return smallest_int_mode_for_size (size * BITS_PER_UNIT);
+}
+
 /* Expand a block clear operation, and return 1 if successful.  Return 0
    if we should let the compiler generate normal code.

@@ -55,7 +98,6 @@ expand_block_clear (rtx operands[])
   HOST_WIDE_INT align;
   HOST_WIDE_INT bytes;
   int offset;
-  int clear_bytes;
   int clear_step;

   /* If this is not a fixed size move, just call memcpy */
@@ -89,62 +131,36 @@ expand_block_clear (rtx operands[])

   bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

-  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
+  auto mode = widest_fixed_size_mode_for_block_clear (bytes, align,
+                                                     unaligned_vsx_ok);
+  offset = 0;
+  rtx dest;
+
+  do
     {
-      machine_mode mode = BLKmode;
-      rtx dest;
+      unsigned int size = GET_MODE_SIZE (mode);

-      if (TARGET_ALTIVEC
-         && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
+      while (bytes >= size)
        {
-         clear_bytes = 16;
-         mode = V4SImode;
-       }
-      else if (bytes >= 8 && TARGET_POWERPC64
-              && (align >= 64 || !STRICT_ALIGNMENT))
-       {
-         clear_bytes = 8;
-         mode = DImode;
-         if (offset == 0 && align < 64)
-           {
-             rtx addr;
+         dest = adjust_address (orig_dest, mode, offset);
+         emit_move_insn (dest, CONST0_RTX (mode));

-             /* If the address form is reg+offset with offset not a
-                multiple of four, reload into reg indirect form here
-                rather than waiting for reload.  This way we get one
-                reload, not one per store.  */
-             addr = XEXP (orig_dest, 0);
-             if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
-                 && CONST_INT_P (XEXP (addr, 1))
-                 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
-               {
-                 addr = copy_addr_to_reg (addr);
-                 orig_dest = replace_equiv_address (orig_dest, addr);
-               }
-           }
-       }
-      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
-       {                       /* move 4 bytes */
-         clear_bytes = 4;
-         mode = SImode;
-       }
-      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
-       {                       /* move 2 bytes */
-         clear_bytes = 2;
-         mode = HImode;
-       }
-      else /* move 1 byte at a time */
-       {
-         clear_bytes = 1;
-         mode = QImode;
+         offset += size;
+         bytes -= size;
        }

-      dest = adjust_address (orig_dest, mode, offset);
+      if (bytes == 0)
+       return 1;

-      emit_move_insn (dest, CONST0_RTX (mode));
+      mode = smallest_fixed_size_mode_for_block_clear (bytes);
+      int gap = GET_MODE_SIZE (mode) - bytes;
+      if (gap > 0)
+       {
+         offset -= gap;
+         bytes += gap;
+       }
     }
-
-  return 1;
+  while (1);
 }

 /* Figure out the correct instructions to generate to load data for
diff --git a/gcc/testsuite/gcc.target/powerpc/block-clear-1.c 
b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
new file mode 100644
index 00000000000..5e16c44fea3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not {\mst[hb]\M} } } */
+
+/* Verify that memclear takes overlap store. */
+void* foo (char* s1)
+{
+  __builtin_memset (s1, 0, 31);
+}

Reply via email to