On Fri, Nov 10, 2017 at 11:45:23AM -0600, Segher Boessenkool wrote: > Hi Mike, > > On Wed, Nov 08, 2017 at 03:02:30PM -0500, Michael Meissner wrote: > > > Should this somehow be joined with p9_xxbrd_<mode>? Or maybe you should > > > generate that, instead. > > > > No, since p9_xxbrd_<mode> doesn't include DImode. We could add yet another > > mode iterator to include DI + V2DI/V2DF modes, but that seems to be > > overkill. > > Having separate DI and V2DI patterns for the same instruction isn't great > either. If it is only this insn it is fine of course, but I suspect we'll > get many more later. Well I guess we can solve it later ;-) > > > I simplified it to only change the one insn that would normally match the > > register to register case to skip if ISA 3.0. I put a test on bswapdi2_reg > > as > > well. > > Thanks. > > > > > > > @@ -2507,6 +2513,8 @@ (define_expand "bswapdi2" > > > > emit_insn (gen_bswapdi2_load (dest, src)); > > > > else if (MEM_P (dest)) > > > > emit_insn (gen_bswapdi2_store (dest, src)); > > > > + else if (TARGET_P9_VECTOR) > > > > + emit_insn (gen_bswapdi2_xxbrd (dest, src)); > > > > else > > > > emit_insn (gen_bswapdi2_reg (dest, src)); > > > > DONE; > > > > > > Pity that you cannot easily do similar for HI and SI. > > > > Not really. Bswap64 generates 9 separate insns, while bswap32 and bswap16 > > only > > generate 3 insns. So, having to add the move direct to/from the vector > > registers would mean it would be slower than the normal case that is > > currently > > generated. But if the value happens to be in a VSX register, then we can > > do it > > in one instruction. > > I meant, have just two lines as above :-) > > > After a burn-in period, I plan to backport a reduced version of the patch > > (XXBRD only) to gcc 7, unless you think it shouldn't go into gcc 7. > > Well, why do we want it on 7?
Advanced customers and the kernel team using pre-production power9 servers have asked for it. Since GCC 8 likely won't be out for several months, I figured to ask for it to go into GCC 7. > > +(define_insn "bswapdi2_xxbrd" > > + [(set (match_operand:DI 0 "gpc_reg_operand" "=wo") > > + (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "wo")))] > > + "TARGET_POWERPC64 && TARGET_P9_VECTOR" > > + "xxbrd %x0,%x1" > > + [(set_attr "type" "vecperm")]) > > This doesn't need TARGET_POWERPC64 I think. > > Please look at that. The patch is okay for trunk. Thanks! I removed the test for POWERPC64. It isn't needed, but I don't have any way of testing whether the resulting code is slower or faster. I'll attach the patch of the changes I checked in. -- Michael Meissner, IBM IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/config/rs6000/rs6000.md =================================================================== --- gcc/config/rs6000/rs6000.md (revision 254637) +++ gcc/config/rs6000/rs6000.md (working copy) @@ -2432,13 +2432,15 @@ (define_insn "bswap<mode>2_store" [(set_attr "type" "store")]) (define_insn_and_split "bswaphi2_reg" - [(set (match_operand:HI 0 "gpc_reg_operand" "=&r") + [(set (match_operand:HI 0 "gpc_reg_operand" "=&r,wo") (bswap:HI - (match_operand:HI 1 "gpc_reg_operand" "r"))) - (clobber (match_scratch:SI 2 "=&r"))] + (match_operand:HI 1 "gpc_reg_operand" "r,wo"))) + (clobber (match_scratch:SI 2 "=&r,X"))] "" - "#" - "reload_completed" + "@ + # + xxbrh %x0,%x1" + "reload_completed && int_reg_operand (operands[0], HImode)" [(set (match_dup 3) (and:SI (lshiftrt:SI (match_dup 4) (const_int 8)) @@ -2454,18 +2456,20 @@ (define_insn_and_split "bswaphi2_reg" operands[3] = simplify_gen_subreg (SImode, operands[0], HImode, 0); operands[4] = simplify_gen_subreg (SImode, operands[1], HImode, 0); } - [(set_attr "length" "12") - (set_attr "type" "*")]) + [(set_attr "length" "12,4") + (set_attr "type" "*,vecperm")]) ;; We are always BITS_BIG_ENDIAN, so the bit positions below in ;; zero_extract insns do not change for -mlittle. (define_insn_and_split "bswapsi2_reg" - [(set (match_operand:SI 0 "gpc_reg_operand" "=&r") + [(set (match_operand:SI 0 "gpc_reg_operand" "=&r,wo") (bswap:SI - (match_operand:SI 1 "gpc_reg_operand" "r")))] + (match_operand:SI 1 "gpc_reg_operand" "r,wo")))] "" - "#" - "reload_completed" + "@ + # + xxbrw %x0,%x1" + "reload_completed && int_reg_operand (operands[0], SImode)" [(set (match_dup 0) ; DABC (rotate:SI (match_dup 1) (const_int 24))) @@ -2481,7 +2485,9 @@ (define_insn_and_split "bswapsi2_reg" (const_int 255)) (and:SI (match_dup 0) (const_int -256))))] - "") + "" + [(set_attr "length" "12,4") + (set_attr "type" "*,vecperm")]) ;; On systems with LDBRX/STDBRX generate the loads/stores directly, just like ;; we do for L{H,W}BRX and ST{H,W}BRX above. If not, we have to generate more @@ -2507,11 +2513,19 @@ (define_expand "bswapdi2" emit_insn (gen_bswapdi2_load (dest, src)); else if (MEM_P (dest)) emit_insn (gen_bswapdi2_store (dest, src)); + else if (TARGET_P9_VECTOR) + emit_insn (gen_bswapdi2_xxbrd (dest, src)); else emit_insn (gen_bswapdi2_reg (dest, src)); DONE; } + if (TARGET_P9_VECTOR && !MEM_P (src) && !MEM_P (dest)) + { + emit_insn (gen_bswapdi2_xxbrd (dest, src)); + DONE; + } + if (!TARGET_POWERPC64) { /* 32-bit mode needs fewer scratch registers, but 32-bit addressing mode @@ -2537,12 +2551,19 @@ (define_insn "bswapdi2_store" "stdbrx %1,%y0" [(set_attr "type" "store")]) +(define_insn "bswapdi2_xxbrd" + [(set (match_operand:DI 0 "gpc_reg_operand" "=wo") + (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "wo")))] + "TARGET_P9_VECTOR" + "xxbrd %x0,%x1" + [(set_attr "type" "vecperm")]) + (define_insn "bswapdi2_reg" [(set (match_operand:DI 0 "gpc_reg_operand" "=&r") (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r"))) (clobber (match_scratch:DI 2 "=&r")) (clobber (match_scratch:DI 3 "=&r"))] - "TARGET_POWERPC64 && TARGET_LDBRX" + "TARGET_POWERPC64 && TARGET_LDBRX && !TARGET_P9_VECTOR" "#" [(set_attr "length" "36")]) @@ -2691,7 +2712,7 @@ (define_split (bswap:DI (match_operand:DI 1 "gpc_reg_operand" ""))) (clobber (match_operand:DI 2 "gpc_reg_operand" "")) (clobber (match_operand:DI 3 "gpc_reg_operand" ""))] - "TARGET_POWERPC64 && reload_completed" + "TARGET_POWERPC64 && !TARGET_P9_VECTOR && reload_completed" [(const_int 0)] " { Index: gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c (revision 0) @@ -0,0 +1,99 @@ +/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-options "-mpower9-vector -O2" } */ + +/* Verify that the XXBR{H,W} instructions are generated if the value is + forced to be in a vector register, and XXBRD is generated all of the + time for register bswap64's. */ + +unsigned short +do_bswap16_mem (unsigned short *p) +{ + return __builtin_bswap16 (*p); /* LHBRX. */ +} + +unsigned short +do_bswap16_reg (unsigned short a) +{ + return __builtin_bswap16 (a); /* gpr sequences. */ +} + +void +do_bswap16_store (unsigned short *p, unsigned short a) +{ + *p = __builtin_bswap16 (a); /* STHBRX. */ +} + +unsigned short +do_bswap16_vect (unsigned short a) +{ + __asm__ (" # %x0" : "+v" (a)); + return __builtin_bswap16 (a); /* XXBRW. */ +} + +unsigned int +do_bswap32_mem (unsigned int *p) +{ + return __builtin_bswap32 (*p); /* LWBRX. */ +} + +unsigned int +do_bswap32_reg (unsigned int a) +{ + return __builtin_bswap32 (a); /* gpr sequences. */ +} + +void +do_bswap32_store (unsigned int *p, unsigned int a) +{ + *p = __builtin_bswap32 (a); /* STWBRX. */ +} + +unsigned int +do_bswap32_vect (unsigned int a) +{ + __asm__ (" # %x0" : "+v" (a)); + return __builtin_bswap32 (a); /* XXBRW. */ +} + +unsigned long +do_bswap64_mem (unsigned long *p) +{ + return __builtin_bswap64 (*p); /* LDBRX. */ +} + +unsigned long +do_bswap64_reg (unsigned long a) +{ + return __builtin_bswap64 (a); /* gpr sequences. */ +} + +void +do_bswap64_store (unsigned long *p, unsigned int a) +{ + *p = __builtin_bswap64 (a); /* STDBRX. */ +} + +double +do_bswap64_double (unsigned long a) +{ + return (double) __builtin_bswap64 (a); /* XXBRD. */ +} + +unsigned long +do_bswap64_vect (unsigned long a) +{ + __asm__ (" # %x0" : "+v" (a)); /* XXBRD. */ + return __builtin_bswap64 (a); +} + +/* Make sure XXBR{H,W,D} is not generated by default. */ +/* { dg-final { scan-assembler-times "xxbrd" 3 } } */ +/* { dg-final { scan-assembler-times "xxbrh" 1 } } */ +/* { dg-final { scan-assembler-times "xxbrw" 1 } } */ +/* { dg-final { scan-assembler-times "ldbrx" 1 } } */ +/* { dg-final { scan-assembler-times "lhbrx" 1 } } */ +/* { dg-final { scan-assembler-times "lwbrx" 1 } } */ +/* { dg-final { scan-assembler-times "stdbrx" 1 } } */ +/* { dg-final { scan-assembler-times "sthbrx" 1 } } */ +/* { dg-final { scan-assembler-times "stwbrx" 1 } } */