Hi,
Please consider this as a reminder to review the ldp and stp peephole
implementation for AArch64 target.
The patch was originally posted at:-
http://gcc.gnu.org/ml/gcc-patches/2013-03/msg01051.html
Please review the same and let me know if its okay.
Build and tested on aarch64-thunder-elf (using Cavium's internal
simulator). No new regressions.
2013-10-29 Naveen H.S <naveen.hurugalaw...@caviumnetworks.com>
gcc/
* config/aarch64/aarch64.md (peephole2s to generate ldp
instruction for 2 consecutive loads from memory): New.
(peephole2s to generate stp instruction for 2 consecutive
stores to memory in integer mode): New.
(peephole2s to generate ldp instruction for 2 consecutive
loads from memory in floating point mode): New.
(peephole2s to generate stp instruction for 2 consecutive
stores to memory in floating point mode): New.
gcc/testsuite
* gcc.target/aarch64/ldp-stp.c: New testcase
Thanks,
Naveen
diff -uprN '-x*.orig' mainline-orig/gcc/config/aarch64/aarch64.md gcc-4.8.0/gcc/config/aarch64/aarch64.md
--- mainline-orig/gcc/config/aarch64/aarch64.md 2013-10-28 17:15:52.363975264 +0530
+++ gcc-4.8.0/gcc/config/aarch64/aarch64.md 2013-10-29 10:11:09.656082201 +0530
@@ -1068,6 +1068,27 @@
(set_attr "mode" "<MODE>")]
)
+(define_peephole2
+ [(set (match_operand:GPI 0 "register_operand")
+ (match_operand:GPI 1 "aarch64_mem_pair_operand"))
+ (set (match_operand:GPI 2 "register_operand")
+ (match_operand:GPI 3 "memory_operand"))]
+ "GET_CODE (operands[1]) == MEM
+ && GET_CODE (XEXP (operands[1], 0)) == PLUS
+ && GET_CODE (XEXP (XEXP (operands[1], 0), 0)) == REG
+ && GET_CODE (XEXP (XEXP (operands[1], 0), 1)) == CONST_INT
+ && GET_MODE (operands[0]) == GET_MODE (XEXP (XEXP (operands[1], 0), 0))
+ && REGNO (operands[0]) != REGNO (operands[2])
+ && REGNO_REG_CLASS (REGNO (operands[0]))
+ == REGNO_REG_CLASS (REGNO (operands[2]))
+ && rtx_equal_p (XEXP (operands[3], 0),
+ plus_constant (Pmode, XEXP (operands[1], 0),
+ GET_MODE_SIZE (<MODE>mode)))
+ && optimize_size"
+ [(parallel [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))])]
+)
+
;; Operands 0 and 2 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
(define_insn "store_pair<mode>"
@@ -1085,6 +1106,27 @@
(set_attr "mode" "<MODE>")]
)
+(define_peephole2
+ [(set (match_operand:GPI 0 "aarch64_mem_pair_operand")
+ (match_operand:GPI 1 "register_operand"))
+ (set (match_operand:GPI 2 "memory_operand")
+ (match_operand:GPI 3 "register_operand"))]
+ "GET_CODE (operands[0]) == MEM
+ && GET_CODE (XEXP (operands[0], 0)) == PLUS
+ && GET_CODE (XEXP (XEXP (operands[0], 0), 0)) == REG
+ && GET_CODE (XEXP (XEXP (operands[0], 0), 1)) == CONST_INT
+ && GET_MODE (operands[1]) == GET_MODE (XEXP (XEXP (operands[0], 0), 0))
+ && REGNO (operands[1]) != REGNO (operands[3])
+ && REGNO_REG_CLASS (REGNO (operands[1]))
+ == REGNO_REG_CLASS (REGNO (operands[3]))
+ && rtx_equal_p (XEXP (operands[2], 0),
+ plus_constant (Pmode, XEXP (operands[0], 0),
+ GET_MODE_SIZE (<MODE>mode)))
+ && optimize_size"
+ [(parallel [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))])]
+)
+
;; Operands 1 and 3 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
(define_insn "load_pair<mode>"
@@ -1102,6 +1144,28 @@
(set_attr "mode" "<MODE>")]
)
+(define_peephole2
+ [(set (match_operand:GPF 0 "register_operand")
+ (match_operand:GPF 1 "aarch64_mem_pair_operand"))
+ (set (match_operand:GPF 2 "register_operand")
+ (match_operand:GPF 3 "memory_operand"))]
+ "GET_CODE (operands[1]) == MEM
+ && GET_CODE (XEXP (operands[1], 0)) == PLUS
+ && GET_CODE (XEXP (XEXP (operands[1], 0), 0)) == REG
+ && GET_CODE (XEXP (XEXP (operands[1], 0), 1)) == CONST_INT
+ && GET_MODE (operands[0]) == GET_MODE (XEXP (XEXP (operands[1], 0), 0))
+ && REGNO (operands[0]) != REGNO (operands[2])
+ && REGNO (operands[0]) >= 32 && REGNO (operands[2]) >= 32
+ && REGNO_REG_CLASS (REGNO (operands[0]))
+ == REGNO_REG_CLASS (REGNO (operands[2]))
+ && rtx_equal_p (XEXP (operands[3], 0),
+ plus_constant (Pmode, XEXP (operands[1], 0),
+ GET_MODE_SIZE (<MODE>mode)))
+ && optimize_size"
+ [(parallel [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))])]
+)
+
;; Operands 0 and 2 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
(define_insn "store_pair<mode>"
@@ -1119,6 +1183,28 @@
(set_attr "mode" "<MODE>")]
)
+(define_peephole2
+ [(set (match_operand:GPF 0 "aarch64_mem_pair_operand")
+ (match_operand:GPF 1 "register_operand"))
+ (set (match_operand:GPF 2 "memory_operand")
+ (match_operand:GPF 3 "register_operand"))]
+ "GET_CODE (operands[0]) == MEM
+ && GET_CODE (XEXP (operands[0], 0)) == PLUS
+ && GET_CODE (XEXP (XEXP (operands[0], 0), 0)) == REG
+ && GET_CODE (XEXP (XEXP (operands[0], 0), 1)) == CONST_INT
+ && GET_MODE (operands[1]) == GET_MODE (XEXP (XEXP (operands[0], 0), 0))
+ && REGNO (operands[1]) != REGNO (operands[3])
+ && REGNO (operands[1]) >= 32 && REGNO (operands[3]) >= 32
+ && REGNO_REG_CLASS (REGNO (operands[1]))
+ == REGNO_REG_CLASS (REGNO (operands[3]))
+ && rtx_equal_p (XEXP (operands[2], 0),
+ plus_constant (Pmode, XEXP (operands[0], 0),
+ GET_MODE_SIZE (<MODE>mode)))
+ && optimize_size"
+ [(parallel [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))])]
+)
+
;; Load pair with writeback. This is primarily used in function epilogues
;; when restoring [fp,lr]
(define_insn "loadwb_pair<GPI:mode>_<P:mode>"
diff -uprN '-x*.orig' mainline-orig/gcc/testsuite/gcc.target/aarch64/ldp-stp.c gcc-4.8.0/gcc/testsuite/gcc.target/aarch64/ldp-stp.c
--- mainline-orig/gcc/testsuite/gcc.target/aarch64/ldp-stp.c 1970-01-01 05:30:00.000000000 +0530
+++ gcc-4.8.0/gcc/testsuite/gcc.target/aarch64/ldp-stp.c 2013-10-28 19:01:11.695986357 +0530
@@ -0,0 +1,33 @@
+/* { dg-options "-Os" } */
+
+extern void abort (void);
+
+typedef struct
+{
+ long int x, y;
+} ldst;
+
+void
+f (ldst p0, ldst p1, ldst p2, ldst p3, ldst p4, ldst p5)
+{
+ if (p2.x != 1 || p2.y != -1
+ || p3.x != -1 || p3.y != 1 || p4.x != 0 || p4.y != -1)
+ abort ();
+}
+
+void
+foo ()
+{
+ ldst p0, p1, p2, p3, p4, p5;
+
+ p4.x = 0;
+ p4.y = -1;
+
+ p5.x = 1;
+ p5.y = 0;
+
+ f (p0, p1, p2, p3, p4, p5);
+}
+
+/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 3 } } */
+/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]" 3 } } */