On Fri, Nov 10, 2017 at 11:45:23AM -0600, Segher Boessenkool wrote:
> Hi Mike,
> 
> On Wed, Nov 08, 2017 at 03:02:30PM -0500, Michael Meissner wrote:
> > > Should this somehow be joined with p9_xxbrd_<mode>?  Or maybe you should
> > > generate that, instead.
> > 
> > No, since p9_xxbrd_<mode> doesn't include DImode.  We could add yet another
> > mode iterator to include DI + V2DI/V2DF modes, but that seems to be 
> > overkill.
> 
> Having separate DI and V2DI patterns for the same instruction isn't great
> either.  If it is only this insn it is fine of course, but I suspect we'll
> get many more later.  Well I guess we can solve it later ;-)
> 
> > I simplified it to only change the one insn that would normally match the
> > register to register case to skip if ISA 3.0.  I put a test on bswapdi2_reg 
> > as
> > well.
> 
> Thanks.
> 
> > 
> > > > @@ -2507,6 +2513,8 @@ (define_expand "bswapdi2"
> > > >         emit_insn (gen_bswapdi2_load (dest, src));
> > > >        else if (MEM_P (dest))
> > > >         emit_insn (gen_bswapdi2_store (dest, src));
> > > > +      else if (TARGET_P9_VECTOR)
> > > > +       emit_insn (gen_bswapdi2_xxbrd (dest, src));
> > > >        else
> > > >         emit_insn (gen_bswapdi2_reg (dest, src));
> > > >        DONE;
> > > 
> > > Pity that you cannot easily do similar for HI and SI.
> > 
> > Not really.  Bswap64 generates 9 separate insns, while bswap32 and bswap16 
> > only
> > generate 3 insns.  So, having to add the move direct to/from the vector
> > registers would mean it would be slower than the normal case that is 
> > currently
> > generated.  But if the value happens to be in a VSX register, then we can 
> > do it
> > in one instruction.
> 
> I meant, have just two lines as above :-)
> 
> > After a burn-in period, I plan to backport a reduced version of the patch
> > (XXBRD only) to gcc 7, unless you think it shouldn't go into gcc 7.
> 
> Well, why do we want it on 7?

Advanced customers and the kernel team using pre-production power9 servers have
asked for it.  Since GCC 8 likely won't be out for several months, I figured to
ask for it to go into GCC 7.

> > +(define_insn "bswapdi2_xxbrd"
> > +  [(set (match_operand:DI 0 "gpc_reg_operand" "=wo")
> > +   (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "wo")))]
> > +  "TARGET_POWERPC64 && TARGET_P9_VECTOR"
> > +  "xxbrd %x0,%x1"
> > +  [(set_attr "type" "vecperm")])
> 
> This doesn't need TARGET_POWERPC64 I think.
> 
> Please look at that.  The patch is okay for trunk.  Thanks!

I removed the test for POWERPC64.  It isn't needed, but I don't have any way of
testing whether the resulting code is slower or faster.  I'll attach the patch
of the changes I checked in.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md (revision 254637)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2432,13 +2432,15 @@ (define_insn "bswap<mode>2_store"
   [(set_attr "type" "store")])
 
 (define_insn_and_split "bswaphi2_reg"
-  [(set (match_operand:HI 0 "gpc_reg_operand" "=&r")
+  [(set (match_operand:HI 0 "gpc_reg_operand" "=&r,wo")
        (bswap:HI
-        (match_operand:HI 1 "gpc_reg_operand" "r")))
-   (clobber (match_scratch:SI 2 "=&r"))]
+        (match_operand:HI 1 "gpc_reg_operand" "r,wo")))
+   (clobber (match_scratch:SI 2 "=&r,X"))]
   ""
-  "#"
-  "reload_completed"
+  "@
+   #
+   xxbrh %x0,%x1"
+  "reload_completed && int_reg_operand (operands[0], HImode)"
   [(set (match_dup 3)
        (and:SI (lshiftrt:SI (match_dup 4)
                             (const_int 8))
@@ -2454,18 +2456,20 @@ (define_insn_and_split "bswaphi2_reg"
   operands[3] = simplify_gen_subreg (SImode, operands[0], HImode, 0);
   operands[4] = simplify_gen_subreg (SImode, operands[1], HImode, 0);
 }
-  [(set_attr "length" "12")
-   (set_attr "type" "*")])
+  [(set_attr "length" "12,4")
+   (set_attr "type" "*,vecperm")])
 
 ;; We are always BITS_BIG_ENDIAN, so the bit positions below in
 ;; zero_extract insns do not change for -mlittle.
 (define_insn_and_split "bswapsi2_reg"
-  [(set (match_operand:SI 0 "gpc_reg_operand" "=&r")
+  [(set (match_operand:SI 0 "gpc_reg_operand" "=&r,wo")
        (bswap:SI
-        (match_operand:SI 1 "gpc_reg_operand" "r")))]
+        (match_operand:SI 1 "gpc_reg_operand" "r,wo")))]
   ""
-  "#"
-  "reload_completed"
+  "@
+   #
+   xxbrw %x0,%x1"
+  "reload_completed && int_reg_operand (operands[0], SImode)"
   [(set (match_dup 0)                                  ; DABC
        (rotate:SI (match_dup 1)
                   (const_int 24)))
@@ -2481,7 +2485,9 @@ (define_insn_and_split "bswapsi2_reg"
                        (const_int 255))
                (and:SI (match_dup 0)
                        (const_int -256))))]
-  "")
+  ""
+  [(set_attr "length" "12,4")
+   (set_attr "type" "*,vecperm")])
 
 ;; On systems with LDBRX/STDBRX generate the loads/stores directly, just like
 ;; we do for L{H,W}BRX and ST{H,W}BRX above.  If not, we have to generate more
@@ -2507,11 +2513,19 @@ (define_expand "bswapdi2"
        emit_insn (gen_bswapdi2_load (dest, src));
       else if (MEM_P (dest))
        emit_insn (gen_bswapdi2_store (dest, src));
+      else if (TARGET_P9_VECTOR)
+       emit_insn (gen_bswapdi2_xxbrd (dest, src));
       else
        emit_insn (gen_bswapdi2_reg (dest, src));
       DONE;
     }
 
+  if (TARGET_P9_VECTOR && !MEM_P (src) && !MEM_P (dest))
+    {
+      emit_insn (gen_bswapdi2_xxbrd (dest, src));
+      DONE;
+    }
+
   if (!TARGET_POWERPC64)
     {
       /* 32-bit mode needs fewer scratch registers, but 32-bit addressing mode
@@ -2537,12 +2551,19 @@ (define_insn "bswapdi2_store"
   "stdbrx %1,%y0"
   [(set_attr "type" "store")])
 
+(define_insn "bswapdi2_xxbrd"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=wo")
+       (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "wo")))]
+  "TARGET_P9_VECTOR"
+  "xxbrd %x0,%x1"
+  [(set_attr "type" "vecperm")])
+
 (define_insn "bswapdi2_reg"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=&r")
        (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))
    (clobber (match_scratch:DI 2 "=&r"))
    (clobber (match_scratch:DI 3 "=&r"))]
-  "TARGET_POWERPC64 && TARGET_LDBRX"
+  "TARGET_POWERPC64 && TARGET_LDBRX && !TARGET_P9_VECTOR"
   "#"
   [(set_attr "length" "36")])
 
@@ -2691,7 +2712,7 @@ (define_split
        (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "")))
    (clobber (match_operand:DI 2 "gpc_reg_operand" ""))
    (clobber (match_operand:DI 3 "gpc_reg_operand" ""))]
-  "TARGET_POWERPC64 && reload_completed"
+  "TARGET_POWERPC64 && !TARGET_P9_VECTOR && reload_completed"
   [(const_int 0)]
   "
 {
Index: gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c        (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c        (revision 0)
@@ -0,0 +1,99 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mpower9-vector -O2" } */
+
+/* Verify that the XXBR{H,W} instructions are generated if the value is
+   forced to be in a vector register, and XXBRD is generated all of the
+   time for register bswap64's.  */
+
+unsigned short
+do_bswap16_mem (unsigned short *p)
+{
+  return __builtin_bswap16 (*p);       /* LHBRX.  */
+}
+
+unsigned short
+do_bswap16_reg (unsigned short a)
+{
+  return __builtin_bswap16 (a);                /* gpr sequences.  */
+}
+
+void
+do_bswap16_store (unsigned short *p, unsigned short a)
+{
+  *p = __builtin_bswap16 (a);          /* STHBRX.  */
+}
+
+unsigned short
+do_bswap16_vect (unsigned short a)
+{
+  __asm__ (" # %x0" : "+v" (a));
+  return __builtin_bswap16 (a);                /* XXBRW.  */
+}
+
+unsigned int
+do_bswap32_mem (unsigned int *p)
+{
+  return __builtin_bswap32 (*p);       /* LWBRX.  */
+}
+
+unsigned int
+do_bswap32_reg (unsigned int a)
+{
+  return __builtin_bswap32 (a);                /* gpr sequences.  */
+}
+
+void
+do_bswap32_store (unsigned int *p, unsigned int a)
+{
+  *p = __builtin_bswap32 (a);          /* STWBRX.  */
+}
+
+unsigned int
+do_bswap32_vect (unsigned int a)
+{
+  __asm__ (" # %x0" : "+v" (a));
+  return __builtin_bswap32 (a);                /* XXBRW.  */
+}
+
+unsigned long
+do_bswap64_mem (unsigned long *p)
+{
+  return __builtin_bswap64 (*p);       /* LDBRX.  */
+}
+
+unsigned long
+do_bswap64_reg (unsigned long a)
+{
+  return __builtin_bswap64 (a);                /* gpr sequences.  */
+}
+
+void
+do_bswap64_store (unsigned long *p, unsigned int a)
+{
+  *p = __builtin_bswap64 (a);          /* STDBRX.  */
+}
+
+double
+do_bswap64_double (unsigned long a)
+{
+  return (double) __builtin_bswap64 (a);       /* XXBRD.  */
+}
+
+unsigned long
+do_bswap64_vect (unsigned long a)
+{
+  __asm__ (" # %x0" : "+v" (a));       /* XXBRD.  */
+  return __builtin_bswap64 (a);
+}
+
+/* Make sure XXBR{H,W,D} is not generated by default.  */
+/* { dg-final { scan-assembler-times "xxbrd"  3  } } */
+/* { dg-final { scan-assembler-times "xxbrh"  1  } } */
+/* { dg-final { scan-assembler-times "xxbrw"  1  } } */
+/* { dg-final { scan-assembler-times "ldbrx"  1  } } */
+/* { dg-final { scan-assembler-times "lhbrx"  1  } } */
+/* { dg-final { scan-assembler-times "lwbrx"  1  } } */
+/* { dg-final { scan-assembler-times "stdbrx" 1  } } */
+/* { dg-final { scan-assembler-times "sthbrx" 1  } } */
+/* { dg-final { scan-assembler-times "stwbrx" 1  } } */

Reply via email to