From 42cc8fea142bba86262787a819d002e5df462893 Mon Sep 17 00:00:00 2001
From: Fei Yang <felix.yang@huawei.com>
Date: Tue, 2 Jun 2020 09:02:41 +0800
Subject: [PATCH] expand: Simplify removing subregs when expanding a copy
 [PR95254]

In rtl expand, if we have a copy that matches one of the following patterns:
  (set (subreg:M1 (reg:M2 ...)) (subreg:M1 (reg:M2 ...)))
  (set (subreg:M1 (reg:M2 ...)) (mem:M1 ADDR))
  (set (mem:M1 ADDR) (subreg:M1 (reg:M2 ...)))
  (set (subreg:M1 (reg:M2 ...)) (constant C))
where mode M1 is equal in size to M2, try to detect whether the mode change
involves an implicit round trip through memory.  If so, see if we can avoid
that by removing the subregs and doing the move in mode M2 instead.

2020-06-02  Felix Yang  <felix.yang@huawei.com>

gcc/
        PR target/95254
        * expr.c (emit_move_insn): Check src and dest of the copy to see
        if one or both of them are subregs, and try to remove the subregs
        when innermode and outermode are equal in size and the mode change
        involves an implicit round trip through memory.

gcc/testsuite/
        PR target/95254
        * gcc.target/aarch64/pr95254.c: New test.
        * gcc.target/i386/pr67609.c: Check "movq\t%xmm0" instead of "movdqa".
---
 gcc/expr.c                                 | 74 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/pr95254.c | 19 ++++++
 gcc/testsuite/gcc.target/i386/pr67609.c    |  2 +-
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr95254.c

diff --git a/gcc/expr.c b/gcc/expr.c
index 6b75028e7f1..ca6b1c1291e 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -3814,6 +3814,80 @@ emit_move_insn (rtx x, rtx y)
   gcc_assert (mode != BLKmode
 	      && (GET_MODE (y) == mode || GET_MODE (y) == VOIDmode));
 
+  /* If we have a copy that looks like one of the following patterns:
+       (set (subreg:M1 (reg:M2 ...)) (subreg:M1 (reg:M2 ...)))
+       (set (subreg:M1 (reg:M2 ...)) (mem:M1 ADDR))
+       (set (mem:M1 ADDR) (subreg:M1 (reg:M2 ...)))
+       (set (subreg:M1 (reg:M2 ...)) (constant C))
+     where mode M1 is equal in size to M2, try to detect whether the
+     mode change involves an implicit round trip through memory.
+     If so, see if we can avoid that by removing the subregs and
+     doing the move in mode M2 instead.  */
+
+  rtx x_inner = NULL_RTX;
+  rtx y_inner = NULL_RTX;
+
+  auto candidate_subreg_p = [&](rtx subreg) {
+    return (REG_P (SUBREG_REG (subreg))
+	    && known_eq (GET_MODE_SIZE (GET_MODE (SUBREG_REG (subreg))),
+			 GET_MODE_SIZE (GET_MODE (subreg)))
+	    && optab_handler (mov_optab, GET_MODE (SUBREG_REG (subreg)))
+	       != CODE_FOR_nothing);
+  };
+
+  auto candidate_mem_p = [&](machine_mode innermode, rtx mem) {
+    return (!targetm.can_change_mode_class (innermode, GET_MODE (mem), ALL_REGS)
+	    && !push_operand (mem, GET_MODE (mem))
+	    /* Not a candidate if innermode requires too much alignment.  */
+	    && (MEM_ALIGN (mem) >= GET_MODE_ALIGNMENT (innermode)
+		|| targetm.slow_unaligned_access (GET_MODE (mem),
+						  MEM_ALIGN (mem))
+		|| !targetm.slow_unaligned_access (innermode,
+						   MEM_ALIGN (mem))));
+  };
+
+  if (SUBREG_P (x) && candidate_subreg_p (x))
+    x_inner = SUBREG_REG (x);
+
+  if (SUBREG_P (y) && candidate_subreg_p (y))
+    y_inner = SUBREG_REG (y);
+
+  if (x_inner != NULL_RTX
+      && y_inner != NULL_RTX
+      && GET_MODE (x_inner) == GET_MODE (y_inner)
+      && !targetm.can_change_mode_class (GET_MODE (x_inner), mode, ALL_REGS))
+    {
+      x = x_inner;
+      y = y_inner;
+      mode = GET_MODE (x_inner);
+    }
+  else if (x_inner != NULL_RTX
+	   && MEM_P (y)
+	   && candidate_mem_p (GET_MODE (x_inner), y))
+    {
+      x = x_inner;
+      y = adjust_address (y, GET_MODE (x_inner), 0);
+      mode = GET_MODE (x_inner);
+    }
+  else if (y_inner != NULL_RTX
+	   && MEM_P (x)
+	   && candidate_mem_p (GET_MODE (y_inner), x))
+    {
+      x = adjust_address (x, GET_MODE (y_inner), 0);
+      y = y_inner;
+      mode = GET_MODE (y_inner);
+    }
+  else if (x_inner != NULL_RTX
+	   && CONSTANT_P (y)
+	   && !targetm.can_change_mode_class (GET_MODE (x_inner),
+					      mode, ALL_REGS)
+	   && (y_inner = simplify_subreg (GET_MODE (x_inner), y, mode, 0)))
+    {
+      x = x_inner;
+      y = y_inner;
+      mode = GET_MODE (x_inner);
+    }
+
   if (CONSTANT_P (y))
     {
       if (optimize
diff --git a/gcc/testsuite/gcc.target/aarch64/pr95254.c b/gcc/testsuite/gcc.target/aarch64/pr95254.c
new file mode 100644
index 00000000000..10bfc868197
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr95254.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-slp-vectorize -march=armv8.2-a+sve -msve-vector-bits=256" } */
+
+typedef short __attribute__((vector_size (8))) v4hi;
+
+typedef union U4HI { v4hi v; short a[4]; } u4hi;
+
+short b[4];
+
+void pass_v4hi (v4hi v)
+{
+    int i;
+    u4hi u;
+    u.v = v;
+    for (i = 0; i < 4; i++)
+      b[i] = u.a[i];
+};
+
+/* { dg-final { scan-assembler-not "ptrue" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr67609.c b/gcc/testsuite/gcc.target/i386/pr67609.c
index 518071bdd86..398cdba5d5f 100644
--- a/gcc/testsuite/gcc.target/i386/pr67609.c
+++ b/gcc/testsuite/gcc.target/i386/pr67609.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -msse2" } */
 /* { dg-require-effective-target lp64 } */
-/* { dg-final { scan-assembler "movdqa" } } */
+/* { dg-final { scan-assembler "movq\t%xmm0" } } */
 
 #include <emmintrin.h>
 __m128d reg;
-- 
2.19.1

