Hi all,

This patch adds support for generating LDPs and STPs of Q-registers.
This allows for more compact code generation and makes better use of the ISA.

It's implemented in a straightforward way by allowing 16-byte modes in the
sched-fusion machinery and adding appropriate peepholes in aarch64-ldpstp.md
as well as the patterns themselves in aarch64-simd.md.

I didn't see any performance effect beyond noise on SPEC2017 on either
Cortex-A72 or Cortex-A53.

Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for trunk?

Thanks,
Kyrill

2018-06-04  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * config/aarch64/aarch64.c (aarch64_mode_valid_for_sched_fusion_p):
    Allow 16-byte modes.
    (aarch64_classify_address): Allow 16-byte modes for load_store_pair_p.
    * config/aarch64/aarch64-ldpstp.md: Add peepholes for LDP and STP of
    128-bit modes.
    * config/aarch64/aarch64-simd.md (load_pair<VQ:mode><VQ2:mode>):
    New pattern.
    (vec_store_pair<VQ:mode><VQ2:mode>): Likewise.
    * config/aarch64/iterators.md (VQ2): New mode iterator.

2018-06-04  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * gcc.target/aarch64/ldp_stp_q.c: New test.
    * gcc.target/aarch64/stp_vec_128_1.c: Likewise.
diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md
index 7f1031dc80fab31f691c0b03d6a485c1b6fd7e53..12d89fd2ef5db5e0d3828d75ae244fdc04438f45 100644
--- a/gcc/config/aarch64/aarch64-ldpstp.md
+++ b/gcc/config/aarch64/aarch64-ldpstp.md
@@ -91,6 +91,35 @@ (define_peephole2
   aarch64_swap_ldrstr_operands (operands, false);
 })
 
+;; Handle consecutive load/store of 128-bit vector (Q-register) modes.
+
+(define_peephole2
+  [(set (match_operand:VQ 0 "register_operand" "")
+	(match_operand:VQ 1 "memory_operand" ""))
+   (set (match_operand:VQ2 2 "register_operand" "")
+	(match_operand:VQ2 3 "memory_operand" ""))]
+  "TARGET_SIMD
+   && aarch64_operands_ok_for_ldpstp (operands, true, <VQ:MODE>mode)"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+	      (set (match_dup 2) (match_dup 3))])]
+{
+  aarch64_swap_ldrstr_operands (operands, true);
+})
+
+(define_peephole2
+  [(set (match_operand:VQ 0 "memory_operand" "")
+	(match_operand:VQ 1 "register_operand" ""))
+   (set (match_operand:VQ2 2 "memory_operand" "")
+	(match_operand:VQ2 3 "register_operand" ""))]
+  "TARGET_SIMD
+   && aarch64_operands_ok_for_ldpstp (operands, false, <VQ:MODE>mode)"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+	      (set (match_dup 2) (match_dup 3))])]
+{
+  aarch64_swap_ldrstr_operands (operands, false);
+})
+
+
 ;; Handle sign/zero extended consecutive load/store.
 
 (define_peephole2
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d5803998c60bf9422dbc4481bac1590f4d209a4a..740a3414a8d9c80addbfa611d530d9f56da11100 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -205,6 +205,34 @@ (define_insn "vec_store_pair<DREG:mode><DREG2:mode>"
   [(set_attr "type" "neon_stp")]
 )
 
+(define_insn "load_pair<VQ:mode><VQ2:mode>"
+  [(set (match_operand:VQ 0 "register_operand" "=w")
+	(match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
+   (set (match_operand:VQ2 2 "register_operand" "=w")
+	(match_operand:VQ2 3 "memory_operand" "m"))]
+  "TARGET_SIMD
+    && rtx_equal_p (XEXP (operands[3], 0),
+		    plus_constant (Pmode,
+			       XEXP (operands[1], 0),
+			       GET_MODE_SIZE (<VQ:MODE>mode)))"
+  "ldp\\t%q0, %q2, %1"
+  [(set_attr "type" "neon_ldp_q")]
+)
+
+(define_insn "vec_store_pair<VQ:mode><VQ2:mode>"
+  [(set (match_operand:VQ 0 "aarch64_mem_pair_operand" "=Ump")
+	(match_operand:VQ 1 "register_operand" "w"))
+   (set (match_operand:VQ2 2 "memory_operand" "=m")
+	(match_operand:VQ2 3 "register_operand" "w"))]
+  "TARGET_SIMD && rtx_equal_p (XEXP (operands[2], 0),
+		plus_constant (Pmode,
+			       XEXP (operands[0], 0),
+			       GET_MODE_SIZE (<VQ:MODE>mode)))"
+  "stp\\t%q1, %q3, %0"
+  [(set_attr "type" "neon_stp_q")]
+)
+
+
 (define_split
   [(set (match_operand:VQ 0 "register_operand" "")
       (match_operand:VQ 1 "register_operand" ""))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 89fce15e0194365a6c0a85236c3ea6b26d26e89e..77f9f8adef6155e51be7aa5551e71c688128ecfc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -5681,7 +5681,8 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
   return mode == SImode || mode == DImode
 	 || mode == SFmode || mode == DFmode
 	 || (aarch64_vector_mode_supported_p (mode)
-	     && known_eq (GET_MODE_SIZE (mode), 8));
+	     && (known_eq (GET_MODE_SIZE (mode), 8)
+		 || known_eq (GET_MODE_SIZE (mode), 16)));
 }
 
 /* Return true if REGNO is a virtual pointer register, or an eliminable
@@ -5838,7 +5839,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
 
 	  if (load_store_pair_p)
 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
-		     || known_eq (GET_MODE_SIZE (mode), 8))
+		     || known_eq (GET_MODE_SIZE (mode), 8)
+		     || known_eq (GET_MODE_SIZE (mode), 16))
 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
 	  else
 	    return (offset_9bit_signed_unscaled_p (mode, offset)
@@ -5898,7 +5900,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
 
 	  if (load_store_pair_p)
 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
-		     || known_eq (GET_MODE_SIZE (mode), 8))
+		     || known_eq (GET_MODE_SIZE (mode), 8)
+		     || known_eq (GET_MODE_SIZE (mode), 16))
 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
 	  else
 	    return offset_9bit_signed_unscaled_p (mode, offset);
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d8b78ef4c474a58071e97eced1fe95ca4e033910..9146414c335dfe16b40dcb6a4233612ae43e2926 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -84,6 +84,9 @@ (define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI])
 ;; Quad vector modes.
 (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
+;; Copy of the above.
+(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
+
 ;; Quad integer vector modes.
 (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c
new file mode 100644
index 0000000000000000000000000000000000000000..e5f04b7ad678627e08e791c05c50ae6dcf081088
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c
@@ -0,0 +1,26 @@
+/* { dg-options "-O2 -mcpu=generic" } */
+
+typedef float float32x4_t __attribute__ ((__vector_size__ ((16))));
+
+float32x4_t arr[4][4];
+
+void
+foo (float32x4_t x, float32x4_t y)
+{
+  arr[0][1] = x;
+  arr[1][0] = y;
+  arr[2][0] = x;
+  arr[1][1] = y;
+  arr[0][2] = x;
+  arr[0][3] = y;
+  arr[1][2] = x;
+  arr[2][1] = y;
+  arr[3][0] = x;
+  arr[3][1] = y;
+  arr[2][2] = x;
+  arr[1][3] = y;
+  arr[2][3] = x;
+  arr[3][2] = y;
+}
+
+/* { dg-final { scan-assembler-times "stp\tq\[0-9\]+, q\[0-9\]" 7 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..07d98d2f36637dfa244fb0e217c39db7c2ee3913
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast" } */
+
+
+typedef int int32x4_t __attribute__ ((__vector_size__ ((16))));
+
+void
+bar (int32x4_t *foo)
+{
+  int i = 0;
+  int32x4_t val = { 3, 2, 5, 1 };
+
+  for (i = 0; i < 256; i+=2)
+    {
+      foo[i] = val;
+      foo[i+1] = val;
+    }
+}
+
+/* { dg-final { scan-assembler "stp\tq\[0-9\]+, q\[0-9\]" } } */

Reply via email to