On Fri, 2022-05-13 at 11:08 -0400, Michael Meissner wrote:
> Add zero_extendditi2. Improve lxvr*x code generation.
>
Hi,
> Subject: Re: [PATCH] Delay splitting addti3/subti3 until first split
pass.
Subject does not seem to match contents?
> This pattern adds zero_extendditi2 so that if we are extending DImode that
> is in a GPR register to TImode in a vector register, the compiler can
> generate MTVSRDDD.
Just "mtvsrdd".
>
> In addition the patterns for generating lxvr{b,h,w,d}x were tuned to allow
> loading to gpr registers. This prevents needlessly doing direct moves to
> get the value into the vector registers if the gpr register was already
> selected.
>
> In updating the insn counts for two tests due to these changes, I noticed
> the tests were done at -O0. I changed this so that the tests are now done
> at the normal -O2 optimization level.
>
s/normal/default/ ?
> This patch will be needed for an upcoming patch for PR target/103109.
>
ok
> I have built this patch on little endian power10, little endian power9,
> and big endian power8 systems. There were no regressions with this
> patch. Can I install this on the GCC 13 trunk?
>
> 2022-05-013 Michael Meissner <[email protected]>
>
> gcc/
> * config/rs6000/vsx.md (vsx_lxvr<wd>x): Add support for loading to
> GPR registers.
> (vsx_stxvr<wd>x): Add support for storing from GPR registers.
swap the froms and tos ?
> (zero_extendditi2): New insn.
>
> gcc/testsuite/
> * gcc.target/powerpc/vsx-load-element-extend-int.c: Use -O2
> instead of -O0 and update insn counts.
> * gcc.target/powerpc/vsx-load-element-extend-short.c: Likewise.
> * gcc.target/powerpc/zero-extend-di-ti.c: New test.
> ---
> gcc/config/rs6000/vsx.md | 82 +++++++++++++++++--
> .../powerpc/vsx-load-element-extend-int.c | 36 ++++----
> .../powerpc/vsx-load-element-extend-short.c | 35 ++++----
> .../gcc.target/powerpc/zero-extend-di-ti.c | 62 ++++++++++++++
> 4 files changed, 164 insertions(+), 51 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
>
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index c091e5e2f47..ad971e3a1de 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -1315,14 +1315,32 @@ (define_expand "vsx_store_<mode>"
> }
> })
>
> -;; Load rightmost element from load_data
> -;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx.
> -(define_insn "vsx_lxvr<wd>x"
> - [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
> - (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" "Z")))]
> - "TARGET_POWER10"
> - "lxvr<wd>x %x0,%y1"
> - [(set_attr "type" "vecload")])
> +;; Load rightmost element from load_data using lxvrbx, lxvrhx, lxvrwx,
> lxvrdx.
> +;; Support TImode being in a GPR register to prevent generating lvxr{d,w,b}x
> +;; and then two direct moves if we ultimately need the value in a GPR
> register.
Perhaps break into two sentences and split the description of what is
prevented in a separate sentence. ?
> +(define_insn_and_split "vsx_lxvr<wd>x"
> + [(set (match_operand:TI 0 "register_operand" "=r,wa")
> + (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" "m,Z")))]
> + "TARGET_POWERPC64 && TARGET_POWER10"
> + "@
> + #
> + lxvr<wd>x %x0,%y1"
> + "&& reload_completed && int_reg_operand (operands[0], TImode)"
> + [(set (match_dup 2) (match_dup 3))
> + (set (match_dup 4) (const_int 0))]
> +{
> + rtx op0 = operands[0];
> + rtx op1 = operands[1];
> +
> + operands[2] = gen_lowpart (DImode, op0);
> + operands[3] = (<MODE>mode == DImode
> + ? op1
> + : gen_rtx_ZERO_EXTEND (DImode, op1));
> +
> + operands[4] = gen_highpart (DImode, op0);
> +}
> + [(set_attr "type" "load,vecload")
> + (set_attr "num_insns" "2,*")])
>
> ;; Store rightmost element into store_data
> ;; using stxvrbx, stxvrhx, strvxwx, strvxdx.
> @@ -5019,6 +5037,54 @@ (define_expand "vsignextend_si_v2di"
> DONE;
> })
>
> +;; Zero extend DI to TI. If we don't have the MTVSRDD instruction (and
> LXVRDX
> +;; in the case of power10), we use the machine independent code. If we are
> +;; loading up GPRs, we fall back to the old code.
Will 'old code' have meaning to future readers of this lump of code?
> +(define_insn_and_split "zero_extendditi2"
> + [(set (match_operand:TI 0 "register_operand" "=r,r,
> wa,&wa")
> + (zero_extend:TI (match_operand:DI 1 "register_operand" "r,wa,r,
> wa")))]
> + "TARGET_POWERPC64 && TARGET_P9_VECTOR"
> + "@
> + #
> + #
> + mtvsrdd %x0,0,%1
> + #"
> + "&& reload_completed
> + && (int_reg_operand (operands[0], TImode)
> + || vsx_register_operand (operands[1], DImode))"
> + [(pc)]
> +{
> + rtx dest = operands[0];
> + rtx src = operands[1];
> + int dest_regno = reg_or_subregno (dest);
> +
> + /* Handle conversion to GPR registers. Load up the low part and then do
> + zero out the upper part. */
s/do//
> + if (INT_REGNO_P (dest_regno))
> + {
> + rtx dest_hi = gen_highpart (DImode, dest);
> + rtx dest_lo = gen_lowpart (DImode, dest);
> +
> + emit_move_insn (dest_lo, src);
> + emit_move_insn (dest_hi, const0_rtx);
> + DONE;
> + }
> +
> + /* For settomg a VSX register from another VSX register, clear the result
> + register, and use XXPERMDI to shift the value into the lower 64-bits.
> */
setting
> + rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno);
> + rtx dest_di = gen_rtx_REG (DImode, dest_regno);
> +
> + emit_move_insn (dest_v2di, CONST0_RTX (V2DImode));
> + if (BYTES_BIG_ENDIAN)
> + emit_insn (gen_vsx_concat_v2di (dest_v2di, dest_di, src));
> + else
> + emit_insn (gen_vsx_concat_v2di (dest_v2di, src, dest_di));
> + DONE;
> +}
> + [(set_attr "type" "integer,mfvsr,vecmove,vecperm")
> + (set_attr "length" "8, 8, *, 8")])
> +
> ;; Sign extend DI to TI. We provide both GPR targets and Altivec targets on
> ;; power10. On earlier systems, the machine independent code will generate a
> ;; shift left to sign extend the 64-bit value to 128-bit.
> diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> index c40e1a3a0f7..1f1281d6b75 100644
> --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> @@ -6,33 +6,25 @@
> /* { dg-do compile { target { ! power10_hw } } } */
> /* { dg-require-effective-target power10_ok } */
> /* { dg-require-effective-target int128 } */
> -
> -/* Deliberately set optization to zero for this test to confirm
> - the lxvr*x instruction is generated. At higher optimization levels
> - the instruction we are looking for is sometimes replaced by other
> - load instructions. */
> -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
> -
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */
ok
> /* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */
>
> #define NUM_VEC_ELEMS 4
> #define ITERS 16
>
> -/*
> -Codegen at time of writing is a single lxvrwx for the zero
> -extended test, and a lwax,mtvsrdd,vextsd2q for the sign
> -extended test.
> -
> -0000000010000c90 <test_sign_extended_load>:
> - 10000c90: aa 1a 24 7d lwax r9,r4,r3
> - 10000c94: 67 4b 40 7c mtvsrdd vs34,0,r9
> - 10000c98: 02 16 5b 10 vextsd2q v2,v2
> - 10000c9c: 20 00 80 4e blr
> -
> -0000000010000cb0 <test_zero_extended_unsigned_load>:
> - 10000cb0: 9b 18 44 7c lxvrwx vs34,r4,r3
> - 10000cb4: 20 00 80 4e blr
> -*/
> +/* Codegen at time of writing is a single lxvrwx for the zero extended test,
> + and a lxvrwx + vexts* sign extension instructions for the sign extended
> + test.
> +
> + 0000000000000000 <test_sign_extended_load>:
> + 0: 9b 18 44 7c lxvrwx vs34,r4,r3
> + 4: 02 16 5a 10 vextsw2d v2,v2
> + 8: 02 16 5b 10 vextsd2q v2,v2
> + c: 20 00 80 4e blr
> +
> + 0000000000000020 <test_zero_extended_unsigned_load>:
> + 20: 9b 18 44 7c lxvrwx vs34,r4,r3
> + 24: 20 00 80 4e blr */
>
> #include <altivec.h>
> #include <stdio.h>
> diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> index 837ba79c9ab..a7721318812 100644
> --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> @@ -6,33 +6,26 @@
> /* { dg-do compile { target { ! power10_hw } } } */
> /* { dg-require-effective-target power10_ok } */
> /* { dg-require-effective-target int128 } */
> -
> -/* Deliberately set optization to zero for this test to confirm
> - the lxvr*x instruction is generated. At higher optimization levels
> - the instruction we are looking for is sometimes replaced by other
> - load instructions. */
> -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */
>
> /* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */
ok
Nothing else jumped out at me from the below.
thanks
-Will
>
> #define NUM_VEC_ELEMS 8
> #define ITERS 16
>
> -/*
> -Codegen at time of writing uses lxvrhx for the zero
> -extension test and lhax,mtvsrdd,vextsd2q for the
> -sign extended test.
> -
> -0000000010001810 <test_sign_extended_load>:
> - 10001810: ae 1a 24 7d lhax r9,r4,r3
> - 10001814: 67 4b 40 7c mtvsrdd vs34,0,r9
> - 10001818: 02 16 5b 10 vextsd2q v2,v2
> - 1000181c: 20 00 80 4e blr
> -
> -0000000010001830 <test_zero_extended_unsigned_load>:
> - 10001830: 5b 18 44 7c lxvrhx vs34,r4,r3
> - 10001834: 20 00 80 4e blr
> -*/
> +/* Codegen at time of writing is a single lxvrwx for the zero extended test,
> + and a lxvrwx + vexts* sign extension instructions for the sign extended
> + test.
> +
> + 0000000000000000 <test_sign_extended_load>:
> + 0: 5b 18 44 7c lxvrhx vs34,r4,r3
> + 4: 02 16 59 10 vextsh2d v2,v2
> + 8: 02 16 5b 10 vextsd2q v2,v2
> + c: 20 00 80 4e blr
> +
> + 0000000000000020 <test_zero_extended_unsigned_load>:
> + 20: 5b 18 44 7c lxvrhx vs34,r4,r3
> + 24: 20 00 80 4e blr */
>
> #include <altivec.h>
> #include <stdio.h>
> diff --git a/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
> b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
> new file mode 100644
> index 00000000000..9b3b9c4dbd0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
> @@ -0,0 +1,62 @@
> +/* { dg-require-effective-target int128 } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +/* This patch makes sure the various optimization and code paths are done for
> + zero extending DImode to TImode on power10. */
> +
> +__uint128_t
> +gpr_to_gpr (unsigned long long a)
> +{
> + /* li 4,0. */
> + return a;
> +}
> +
> +__uint128_t
> +mem_to_gpr (unsigned long long *p)
> +{
> + /* ld 3,0(3); li 4,0. */
> + return *p;
> +}
> +
> +__uint128_t
> +vsx_to_gpr (__uint128_t *p, double d)
> +{
> + /* fctiduz 1,1; li 4,0;mfvsrd 3,1. */
> + return (unsigned long long)d;
> +}
> +
> +void
> +gpr_to_vsx (__uint128_t *p, unsigned long long a)
> +{
> + /* mtvsrdd 0,0,4; stxv 0,0(3). */
> + __uint128_t b = a;
> + __asm__ (" # %x0" : "+wa" (b));
> + *p = b;
> +}
> +
> +void
> +mem_to_vsx (__uint128_t *p, unsigned long long *q)
> +{
> + /* lxvrdx 0,0,4; stxv 0,0(3). */
> + __uint128_t a = *q;
> + __asm__ (" # %x0" : "+wa" (a));
> + *p = a;
> +}
> +
> +void
> +vsx_to_vsx (__uint128_t *p, double d)
> +{
> + /* fctiduz 1,1; xxspltib 0,0; xxpermdi 0,0,1,0; stxv 0,0(3). */
> + __uint128_t a = (unsigned long long)d;
> + __asm__ (" # %x0" : "+wa" (a));
> + *p = a;
> +}
> +
> +/* { dg-final { scan-assembler-times {\mli\M} 3 } } */
> +/* { dg-final { scan-assembler-times {\mld\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mmfvsrd\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mstxv\M} 3 } } */
> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */
> --
> 2.35.3
>
>