Hi,

on 2023/11/6 17:47, HAO CHEN GUI wrote:
> Hi,
>   Patch 2 enables the 16-byte by-pieces move on rs6000, and this patch
> fixes the regression cases caused by that patch.  For sra-17/18, the long
> array with 4 elements can be loaded by one 16-byte by-pieces move on
> 32-bit platforms, so the array is not constructed in LC0 and the SRA
> optimization cannot be applied.  The "-mno-vsx" option is added for
> 32-bit platforms, as it sets MOVE_MAX_PIECES to 4 bytes there and the
> array then can't be loaded by a single by-pieces move.
I'd expect the adjustments to the two affected sra-17/18 test cases to be
moved into the previous related patch, since that patch is what caused them
to fail and your new define_insn_and_split doesn't actually affect them?

>
>   Another regression is on P8 LE.  A 16-byte memory-to-memory move is
> implemented by two TImode loads/stores.  Each TImode load/store is
> finally split into two DImode loads/stores on P8 LE, as it has no
> unaligned vector load/store instructions.  Actually, a 16-byte
> memory-to-memory move can be implemented by a pair of V2DI
> element-reversed loads/stores on P8 LE.  The patch creates an
> insn_and_split pattern for this optimization.
>
>   Compared to the previous version, this one fixes the syntax errors in
> the test cases.
>
>   Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions.  Is this OK for trunk?
>
> Thanks
> Gui Haochen
>
> ChangeLog
> rs6000: Enable 16-byte by pieces move
>
> This patch enables the 16-byte by-pieces move.  The 16-byte move is
> generated with TImode and finally implemented by vector instructions.
> There are several regression cases after the enablement.  A 16-byte
> TImode memory-to-memory move was originally implemented by two pairs of
> DImode loads/stores on P8 LE, as there is no unaligned vsx load/store
> there.  The patch fixes the problem by creating an insn_and_split
> pattern that converts it to one pair of element-reversed load/store.
> Two SRA cases lost the SRA optimization because the array can be loaded
> by one 16-byte move and so is not initialized in LC0 on 32-bit
> platforms; they are fixed by adding the -mno-vsx option.
>
> gcc/
> 	PR target/111449
> 	* config/rs6000/vsx.md (*vsx_le_mem_to_mem_mov_ti): New.
>
> gcc/testsuite/
> 	PR target/111449
> 	* gcc.dg/tree-ssa/sra-17.c: Add no-vsx option for powerpc ilp32.
> 	* gcc.dg/tree-ssa/sra-18.c: Likewise.
> 	* gcc.target/powerpc/pr111449-1.c: New.

Wrong test case name (the file actually added below is pr111449-2.c).

>
> patch.diff
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index f3b40229094..9f6bc49998a 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -414,6 +414,27 @@ (define_mode_attr VM3_char [(V2DI "d")
>
>  ;; VSX moves
>
> +;; TImode memory to memory move optimization on LE with p8vector
> +(define_insn_and_split "*vsx_le_mem_to_mem_mov_ti"
> +  [(set (match_operand:TI 0 "indexed_or_indirect_operand" "=Z")
> +	(match_operand:TI 1 "indexed_or_indirect_operand" "Z"))]
> +  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR

Nit: put each of these conditions on its own line, since the ones below are
already on separate lines.

> +   && !MEM_VOLATILE_P (operands[0])
> +   && !MEM_VOLATILE_P (operands[1])
> +   && !reload_completed"

Is !reload_completed there because a new pseudo is required?  It would be
better to use can_create_pseudo_p ().
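With both of the above comments addressed, I'd expect the insn condition to
read something like this (just a sketch on my side, untested):

  "!BYTES_BIG_ENDIAN
   && TARGET_VSX
   && !TARGET_P9_VECTOR
   && !MEM_VOLATILE_P (operands[0])
   && !MEM_VOLATILE_P (operands[1])
   && can_create_pseudo_p ()"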
BR,
Kewen

> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx tmp = gen_reg_rtx (V2DImode);
> +  rtx src = adjust_address (operands[1], V2DImode, 0);
> +  emit_insn (gen_vsx_ld_elemrev_v2di (tmp, src));
> +  rtx dest = adjust_address (operands[0], V2DImode, 0);
> +  emit_insn (gen_vsx_st_elemrev_v2di (dest, tmp));
> +  DONE;
> +}
> +  [(set_attr "length" "16")])
> +
>  ;; The patterns for LE permuted loads and stores come before the general
>  ;; VSX moves so they match first.
>  (define_insn_and_split "*vsx_le_perm_load_<mode>"
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c
> index 221d96b6cd9..b0d4811e77b 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c
> @@ -1,6 +1,7 @@
>  /* { dg-do run { target { aarch64*-*-* alpha*-*-* arm*-*-* hppa*-*-* powerpc*-*-* s390*-*-* } } } */
>  /* { dg-options "-O2 -fdump-tree-esra --param sra-max-scalarization-size-Ospeed=32" } */
>  /* { dg-additional-options "-mcpu=ev4" { target alpha*-*-* } } */
> +/* { dg-additional-options "-mno-vsx" { target { powerpc*-*-* && ilp32 } } } */
>
>  extern void abort (void);
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c
> index f5e6a21c2ae..2cdeae6e9e7 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c
> @@ -1,6 +1,7 @@
>  /* { dg-do run { target { aarch64*-*-* alpha*-*-* arm*-*-* hppa*-*-* powerpc*-*-* s390*-*-* } } } */
>  /* { dg-options "-O2 -fdump-tree-esra --param sra-max-scalarization-size-Ospeed=32" } */
>  /* { dg-additional-options "-mcpu=ev4" { target alpha*-*-* } } */
> +/* { dg-additional-options "-mno-vsx" { target { powerpc*-*-* && ilp32 } } } */
>
>  extern void abort (void);
>  struct foo { long x; };
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr111449-2.c b/gcc/testsuite/gcc.target/powerpc/pr111449-2.c
> new file mode 100644
> index 00000000000..7003bdc0208
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr111449-2.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile { target { has_arch_pwr8 } } } */
> +/* { dg-require-effective-target powerpc_p8vector_ok } */
> +/* { dg-options "-mvsx -O2" } */
> +
> +/* Ensure 16-byte by pieces move is enabled.  */
> +
> +void move1 (void *s1, void *s2)
> +{
> +  __builtin_memcpy (s1, s2, 16);
> +}
> +
> +void move2 (void *s1)
> +{
> +  __builtin_memcpy (s1, "0123456789012345", 16);
> +}
> +
> +/* { dg-final { scan-assembler-times {\mlxvd2x\M|\mp?lxv\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mstxvd2x\M|\mstxv\M} 2 } } */
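FWIW, with the new pattern I'd expect move1 on P8 LE to end up as a single
element-reversed load/store pair, roughly like the following (only a sketch;
register numbers assume the usual ELFv2 argument registers r3/r4 and an
arbitrary vsx register):

	lxvd2x 0,0,4	# element-reversed 16-byte load from *s2
	stxvd2x 0,0,3	# element-reversed 16-byte store to *s1
	blr

The two element reversals cancel out, so the 16 bytes land in memory in the
original order.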