Hi,

Originally, a 16-byte memory-to-memory move was expanded via a move pattern, and expand_block_move performed an optimization on power8 LE that leverages a V2DI reversed load/store for the memory-to-memory move. Now the move is done by the 16-byte by-pieces move and that optimization is lost. This patch adds an insn_and_split pattern to recover it.
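
For context, here is a minimal sketch of the kind of code affected. The function and parameter names are just for illustration; the instruction sequences mentioned in the comment are the ones described in the commit message and checked by the new testcase below.

/* A 16-byte memory-to-memory copy.  On power8 LE, expand_block_move
   used to turn this into a single reversed V2DI load/store pair
   (lxvd2x/stxvd2x).  With the 16-byte by-pieces move it would instead
   be split into two DImode load/store pairs; the pattern added by this
   patch recovers the lxvd2x/stxvd2x sequence.  */
void
copy16 (void *dst, const void *src)
{
  __builtin_memcpy (dst, src, 16);
}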
Compared to the previous version, the main change is to remove the volatile memory operand check from the insn condition, as it is not needed.

Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no regressions. Is this OK for trunk?

Thanks
Gui Haochen

ChangeLog
rs6000: Fix regression cases caused by the 16-byte by-pieces move

The previous patch enables the 16-byte by-pieces move. Originally, a
16-byte move was implemented via a move pattern, and expand_block_move
performed an optimization on power8 LE that leverages a V2DI reversed
load/store for the memory-to-memory move. Now the 16-byte move is
implemented via the by-pieces move and is finally split into two DI
load/store pairs, so the optimization is lost. This patch creates an
insn_and_split pattern to recover the optimization.

gcc/
	PR target/111449
	* config/rs6000/vsx.md (*vsx_le_mem_to_mem_mov_ti): New.

gcc/testsuite/
	PR target/111449
	* gcc.target/powerpc/pr111449-2.c: New.

patch.diff
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index f3b40229094..26fa32829af 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -414,6 +414,27 @@ (define_mode_attr VM3_char [(V2DI "d")
 
 ;; VSX moves
 
+;; TImode memory to memory move optimization on LE with p8vector
+(define_insn_and_split "*vsx_le_mem_to_mem_mov_ti"
+  [(set (match_operand:TI 0 "indexed_or_indirect_operand" "=Z")
+	(match_operand:TI 1 "indexed_or_indirect_operand" "Z"))]
+  "!BYTES_BIG_ENDIAN
+   && TARGET_VSX
+   && !TARGET_P9_VECTOR
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx tmp = gen_reg_rtx (V2DImode);
+  rtx src = adjust_address (operands[1], V2DImode, 0);
+  emit_insn (gen_vsx_ld_elemrev_v2di (tmp, src));
+  rtx dest = adjust_address (operands[0], V2DImode, 0);
+  emit_insn (gen_vsx_st_elemrev_v2di (dest, tmp));
+  DONE;
+}
+  [(set_attr "length" "16")])
+
 ;; The patterns for LE permuted loads and stores come before the general
 ;; VSX moves so they match first.
 (define_insn_and_split "*vsx_le_perm_load_<mode>"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr111449-2.c b/gcc/testsuite/gcc.target/powerpc/pr111449-2.c
new file mode 100644
index 00000000000..7003bdc0208
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr111449-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { has_arch_pwr8 } } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mvsx -O2" } */
+
+/* Ensure 16-byte by pieces move is enabled. */
+
+void move1 (void *s1, void *s2)
+{
+  __builtin_memcpy (s1, s2, 16);
+}
+
+void move2 (void *s1)
+{
+  __builtin_memcpy (s1, "0123456789012345", 16);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvd2x\M|\mp?lxv\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstxvd2x\M|\mstxv\M} 2 } } */
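
As an aside (not part of the patch or its testcase), I would expect a 16-byte aggregate copy like the following to take the same 16-byte by-pieces path and, with the new pattern, also end up as a reversed V2DI load/store pair on power8 LE. The struct and function names here are made up for illustration.

/* Hypothetical 16-byte aggregate copy, assumed to be expanded by the
   same 16-byte by-pieces move as the __builtin_memcpy cases above.  */
struct pair16 { long long lo, hi; };

void
copy_pair16 (struct pair16 *d, const struct pair16 *s)
{
  *d = *s;
}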