https://gcc.gnu.org/g:a1c592be501b12942181391cb6a7e3cca54e4e45
commit r16-1717-ga1c592be501b12942181391cb6a7e3cca54e4e45
Author: Dimitar Dimitrov <dimi...@dinux.eu>
Date:   Sun Feb 9 17:55:03 2025 +0200

    pru: Split 64-bit moves into a sequence of 32-bit moves

    The 64-bit register-to-register moves on PRU are implemented with two
    instructions moving 32-bit registers.  Defining a split for the 64-bit
    moves allows this to be described in RTL, and thus one of the 32-bit
    moves to be eliminated if the destination register is dead.

    Also, split the loading of non-trivial 64-bit integer constants.  The
    resulting 32-bit integer constants have a better chance of being loaded
    with something more optimal than an "ldi32".

    For now do the splits only after register allocation, because LRA does
    not yet efficiently handle subregs.  See
    https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651366.html

    This patch shows a slight improvement for the wikisort benchmark from
    embench-iot:

    Benchmark        size-before  size-after  difference
    ---------        -----------  ----------  ----------
    aha-mont64             1,648       1,648           0
    crc32                    104         104           0
    depthconv              1,172       1,172           0
    edn                    3,040       3,040           0
    huffbench              1,616       1,616           0
    matmult-int              748         748           0
    md5sum                   700         700           0
    nettle-aes             2,664       2,664           0
    nettle-sha256          5,732       5,732           0
    nsichneu              21,372      21,372           0
    picojpeg               9,716       9,716           0
    qrduino                8,556       8,556           0
    sglib-combined         3,724       3,724           0
    slre                   3,488       3,488           0
    statemate              1,132       1,132           0
    tarfind                  652         652           0
    ud                     1,004       1,004           0
    wikisort              18,120      18,092         -28
    xgboost                  300         300           0

    gcc/ChangeLog:

            * config/pru/pru.md (reg move splitter): New splitter for 64-bit
            register moves into two 32-bit moves.
            (const_int move splitter): New splitter for 64-bit constant
            integer moves into two 32-bit moves.

    gcc/testsuite/ChangeLog:

            * gcc.target/pru/mov64-subreg-1.c: New test.
            * gcc.target/pru/mov64-subreg-2.c: New test.

Signed-off-by: Dimitar Dimitrov <dimi...@dinux.eu> Diff: --- gcc/config/pru/pru.md | 77 +++++++++++++++++++++++++++ gcc/testsuite/gcc.target/pru/mov64-subreg-1.c | 9 ++++ gcc/testsuite/gcc.target/pru/mov64-subreg-2.c | 8 +++ 3 files changed, 94 insertions(+) diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md index fcd310613f50..3504e42e9002 100644 --- a/gcc/config/pru/pru.md +++ b/gcc/config/pru/pru.md @@ -283,6 +283,83 @@ [(set_attr "type" "st,ld,alu,alu,alu,alu,alu,alu") (set_attr "length" "4,4,4,4,8,8,8,16")]) +; Break 64-bit register-to-register moves into 32-bit moves. +; If only a subreg of the destination is used, this split would allow +; for the other 32-bit subreg of the DI register to be eliminated. +(define_split + [(set (match_operand:DI 0 "register_operand") + (match_operand:DI 1 "register_operand"))] + " + /* TODO - LRA does not yet handle subregs efficiently. + So it is profitable to split only after register allocation is + complete. + Once https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651366.html + is merged, this condition should be removed to allow splitting + before LRA. */ + reload_completed + /* Sign-extended paradoxical registers require expansion + of the proper pattern. We can do only zero extension here. */ + && (SUBREG_P (operands[1]) && paradoxical_subreg_p (operands[1]) + ? 
SUBREG_PROMOTED_VAR_P (operands[1]) + && SUBREG_PROMOTED_UNSIGNED_P (operands[1]) > 0 + : true)" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] + " + rtx dst_lo = simplify_gen_subreg (SImode, operands[0], DImode, 0); + rtx dst_hi = simplify_gen_subreg (SImode, operands[0], DImode, 4); + rtx src_lo = simplify_gen_subreg (SImode, operands[1], DImode, 0); + rtx src_hi = simplify_gen_subreg (SImode, operands[1], DImode, 4); + + if (SUBREG_P (operands[1]) && paradoxical_subreg_p (operands[1])) + { + gcc_assert (SUBREG_PROMOTED_VAR_P (operands[1])); + gcc_assert (SUBREG_PROMOTED_UNSIGNED_P (operands[1]) > 0); + + operands[0] = dst_lo; + operands[1] = src_lo; + operands[2] = dst_hi; + operands[3] = const0_rtx; + } + else if (!reg_overlap_mentioned_p (dst_lo, src_hi)) + { + operands[0] = dst_lo; + operands[1] = src_lo; + operands[2] = dst_hi; + operands[3] = src_hi; + } + else + { + operands[0] = dst_hi; + operands[1] = src_hi; + operands[2] = dst_lo; + operands[3] = src_lo; + } + " +) + +; Break loading of non-trivial 64-bit constant integers. The split +; will not generate better code sequence, but at least would allow +; dropping a non-live 32-bit part of the destination, or better +; constant propagation. +(define_split + [(set (match_operand:DI 0 "register_operand") + (match_operand:DI 1 "const_int_operand"))] + "reload_completed + && !satisfies_constraint_Z (operands[1]) + && !satisfies_constraint_Um (operands[1]) + && !satisfies_constraint_T (operands[1])" + + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] + " + operands[2] = simplify_gen_subreg (SImode, operands[0], DImode, 4); + operands[3] = simplify_gen_subreg (SImode, operands[1], DImode, 4);; + operands[0] = simplify_gen_subreg (SImode, operands[0], DImode, 0); + operands[1] = simplify_gen_subreg (SImode, operands[1], DImode, 0); + " +) + ; ; load_multiple pattern(s). 
; diff --git a/gcc/testsuite/gcc.target/pru/mov64-subreg-1.c b/gcc/testsuite/gcc.target/pru/mov64-subreg-1.c new file mode 100644 index 000000000000..9b60aa033f15 --- /dev/null +++ b/gcc/testsuite/gcc.target/pru/mov64-subreg-1.c @@ -0,0 +1,9 @@ +/* { dg-do assemble } */ +/* { dg-options "-Os" } */ +/* { dg-final { object-size text == 8 } } */ + + +unsigned test(char a, unsigned long long b) +{ + return b; +} diff --git a/gcc/testsuite/gcc.target/pru/mov64-subreg-2.c b/gcc/testsuite/gcc.target/pru/mov64-subreg-2.c new file mode 100644 index 000000000000..146cf9456087 --- /dev/null +++ b/gcc/testsuite/gcc.target/pru/mov64-subreg-2.c @@ -0,0 +1,8 @@ +/* { dg-do assemble } */ +/* { dg-options "-Os" } */ +/* { dg-final { object-size text == 12 } } */ + +unsigned long long test(void) +{ + return 0xffffffff00000000UL; +}