PowerPC ISA 3.0 does not have a byte-reverse instruction that operates on the
GPRs, but it does have vector byte swap half-word, word, double-word operations
in the VSX registers. The enclosed patch enables generation of the byte
reversal instructions for register-register operations. It still prefers to
generate the load with byte reverse (L{H,W,D}BRX) or store with byte reverse
(ST{H,W,D}BRX) instructions over the register sequence.
For 16-bit and 32-bit byte swaps, it typically does the traditional operation
in GPR registers, but it will generate XXBR{H,W} if the values are in vector
registers.
For 64-bit swaps, it no longer generates the 9 instruction GPR sequence; it uses
XXBRD instead. I did some timing runs on a prototype power9 system, and it was
slightly faster to do a direct move to the vector unit, XXBRD, and a direct move
back to a GPR than the traditional sequence.
I did bootstraps on little endian Power8 and Power9 systems (with the default
cpu set to power8 and power9 respectively). There were no regressions. Can I
check this patch into the trunk?
[gcc]
2017-11-08 Michael Meissner <[email protected]>
* config/rs6000/rs6000.md (bswaphi2_reg): On ISA 3.0 systems,
enable generating XXBR{H,W} if the value is in a vector
register.
(bswapsi2_reg): Likewise.
(bswapdi2_reg): On ISA 3.0 systems, use XXBRD to do bswap64
instead of doing the GPR sequence used on previous machines.
(bswapdi2_xxbrd): Likewise.
(bswapdi2_reg splitters): Use int_reg_operand instead of
gpc_reg_operand to not match when XXBRD is generated.
[gcc/testsuite]
2017-11-08 Michael Meissner <[email protected]>
* gcc.target/powerpc/p9-xxbr-3.c: New test.
--
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: [email protected], phone: +1 (978) 899-4797
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md (revision 254516)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2432,13 +2432,15 @@ (define_insn "bswap<mode>2_store"
[(set_attr "type" "store")])
(define_insn_and_split "bswaphi2_reg"
- [(set (match_operand:HI 0 "gpc_reg_operand" "=&r")
+ [(set (match_operand:HI 0 "gpc_reg_operand" "=&r,wo")
(bswap:HI
- (match_operand:HI 1 "gpc_reg_operand" "r")))
- (clobber (match_scratch:SI 2 "=&r"))]
+ (match_operand:HI 1 "gpc_reg_operand" "r,wo")))
+ (clobber (match_scratch:SI 2 "=&r,X"))]
""
- "#"
- "reload_completed"
+ "@
+ #
+ xxbrh %x0,%x1"
+ "reload_completed && int_reg_operand (operands[0], HImode)"
[(set (match_dup 3)
(and:SI (lshiftrt:SI (match_dup 4)
(const_int 8))
@@ -2454,18 +2456,20 @@ (define_insn_and_split "bswaphi2_reg"
operands[3] = simplify_gen_subreg (SImode, operands[0], HImode, 0);
operands[4] = simplify_gen_subreg (SImode, operands[1], HImode, 0);
}
- [(set_attr "length" "12")
- (set_attr "type" "*")])
+ [(set_attr "length" "12,4")
+ (set_attr "type" "*,vecperm")])
;; We are always BITS_BIG_ENDIAN, so the bit positions below in
;; zero_extract insns do not change for -mlittle.
(define_insn_and_split "bswapsi2_reg"
- [(set (match_operand:SI 0 "gpc_reg_operand" "=&r")
+ [(set (match_operand:SI 0 "gpc_reg_operand" "=&r,wo")
(bswap:SI
- (match_operand:SI 1 "gpc_reg_operand" "r")))]
+ (match_operand:SI 1 "gpc_reg_operand" "r,wo")))]
""
- "#"
- "reload_completed"
+ "@
+ #
+ xxbrw %x0,%x1"
+ "reload_completed && int_reg_operand (operands[0], SImode)"
[(set (match_dup 0) ; DABC
(rotate:SI (match_dup 1)
(const_int 24)))
@@ -2481,7 +2485,9 @@ (define_insn_and_split "bswapsi2_reg"
(const_int 255))
(and:SI (match_dup 0)
(const_int -256))))]
- "")
+ ""
+ [(set_attr "length" "12,4")
+ (set_attr "type" "*,vecperm")])
;; On systems with LDBRX/STDBRX generate the loads/stores directly, just like
;; we do for L{H,W}BRX and ST{H,W}BRX above. If not, we have to generate more
@@ -2507,6 +2513,8 @@ (define_expand "bswapdi2"
emit_insn (gen_bswapdi2_load (dest, src));
else if (MEM_P (dest))
emit_insn (gen_bswapdi2_store (dest, src));
+ else if (TARGET_P9_VECTOR)
+ emit_insn (gen_bswapdi2_xxbrd (dest, src));
else
emit_insn (gen_bswapdi2_reg (dest, src));
DONE;
@@ -2537,6 +2545,13 @@ (define_insn "bswapdi2_store"
"stdbrx %1,%y0"
[(set_attr "type" "store")])
+(define_insn "bswapdi2_xxbrd"
+ [(set (match_operand:DI 0 "gpc_reg_operand" "=wo")
+ (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "wo")))]
+ "TARGET_POWERPC64 && TARGET_P9_VECTOR"
+ "xxbrd %x0,%x1"
+ [(set_attr "type" "vecperm")])
+
(define_insn "bswapdi2_reg"
[(set (match_operand:DI 0 "gpc_reg_operand" "=&r")
(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))
@@ -2544,7 +2559,8 @@ (define_insn "bswapdi2_reg"
(clobber (match_scratch:DI 3 "=&r"))]
"TARGET_POWERPC64 && TARGET_LDBRX"
"#"
- [(set_attr "length" "36")])
+ [(set_attr "length" "36")
+ (set_attr "type" "*")])
;; Non-power7/cell, fall back to use lwbrx/stwbrx
(define_insn "*bswapdi2_64bit"
@@ -2560,7 +2576,7 @@ (define_insn "*bswapdi2_64bit"
[(set_attr "length" "16,12,36")])
(define_split
- [(set (match_operand:DI 0 "gpc_reg_operand" "")
+ [(set (match_operand:DI 0 "int_reg_operand" "")
(bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "")))
(clobber (match_operand:DI 2 "gpc_reg_operand" ""))
(clobber (match_operand:DI 3 "gpc_reg_operand" ""))]
@@ -2625,7 +2641,7 @@ (define_split
(define_split
[(set (match_operand:DI 0 "indexed_or_indirect_operand" "")
- (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "")))
+ (bswap:DI (match_operand:DI 1 "int_reg_operand" "")))
(clobber (match_operand:DI 2 "gpc_reg_operand" ""))
(clobber (match_operand:DI 3 "gpc_reg_operand" ""))]
"TARGET_POWERPC64 && !TARGET_LDBRX && reload_completed"
@@ -2687,10 +2703,10 @@ (define_split
}")
(define_split
- [(set (match_operand:DI 0 "gpc_reg_operand" "")
- (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "")))
- (clobber (match_operand:DI 2 "gpc_reg_operand" ""))
- (clobber (match_operand:DI 3 "gpc_reg_operand" ""))]
+ [(set (match_operand:DI 0 "int_reg_operand" "")
+ (bswap:DI (match_operand:DI 1 "int_reg_operand" "")))
+ (clobber (match_operand:DI 2 "int_reg_operand" ""))
+ (clobber (match_operand:DI 3 "int_reg_operand" ""))]
"TARGET_POWERPC64 && reload_completed"
[(const_int 0)]
"
@@ -2722,9 +2738,9 @@ (define_insn "bswapdi2_32bit"
[(set_attr "length" "16,12,36")])
(define_split
- [(set (match_operand:DI 0 "gpc_reg_operand" "")
+ [(set (match_operand:DI 0 "int_reg_operand" "")
(bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "")))
- (clobber (match_operand:SI 2 "gpc_reg_operand" ""))]
+ (clobber (match_operand:SI 2 "int_reg_operand" ""))]
"!TARGET_POWERPC64 && reload_completed"
[(const_int 0)]
"
Index: gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c (nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c (working copy)
@@ -0,0 +1,99 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mpower9-vector -O2" } */
+
+/* Verify that the XXBR{H,W} instructions are generated if the value is
+ forced to be in a vector register, and XXBRD is generated all of the
+ time for register bswap64's. */
+
+unsigned short
+do_bswap16_mem (unsigned short *p)
+{
+ return __builtin_bswap16 (*p); /* LHBRX. */
+}
+
+unsigned short
+do_bswap16_reg (unsigned short a)
+{
+ return __builtin_bswap16 (a); /* gpr sequences. */
+}
+
+void
+do_bswap16_store (unsigned short *p, unsigned short a)
+{
+ *p = __builtin_bswap16 (a); /* STHBRX. */
+}
+
+unsigned short
+do_bswap16_vect (unsigned short a)
+{
+ __asm__ (" # %x0" : "+v" (a));
+ return __builtin_bswap16 (a); /* XXBRH. */
+}
+
+unsigned int
+do_bswap32_mem (unsigned int *p)
+{
+ return __builtin_bswap32 (*p); /* LWBRX. */
+}
+
+unsigned int
+do_bswap32_reg (unsigned int a)
+{
+ return __builtin_bswap32 (a); /* gpr sequences. */
+}
+
+void
+do_bswap32_store (unsigned int *p, unsigned int a)
+{
+ *p = __builtin_bswap32 (a); /* STWBRX. */
+}
+
+unsigned int
+do_bswap32_vect (unsigned int a)
+{
+ __asm__ (" # %x0" : "+v" (a));
+ return __builtin_bswap32 (a); /* XXBRW. */
+}
+
+unsigned long
+do_bswap64_mem (unsigned long *p)
+{
+ return __builtin_bswap64 (*p); /* LDBRX. */
+}
+
+unsigned long
+do_bswap64_reg (unsigned long a)
+{
+ return __builtin_bswap64 (a); /* XXBRD. */
+}
+
+void
+do_bswap64_store (unsigned long *p, unsigned int a)
+{
+ *p = __builtin_bswap64 (a); /* STDBRX. */
+}
+
+double
+do_bswap64_double (unsigned long a)
+{
+ return (double) __builtin_bswap64 (a); /* XXBRD. */
+}
+
+unsigned long
+do_bswap64_vect (unsigned long a)
+{
+ __asm__ (" # %x0" : "+v" (a)); /* XXBRD. */
+ return __builtin_bswap64 (a);
+}
+
+/* Make sure the expected XXBR{H,W,D} and byte-reverse load/store counts are generated. */
+/* { dg-final { scan-assembler-times "xxbrd" 3 } } */
+/* { dg-final { scan-assembler-times "xxbrh" 1 } } */
+/* { dg-final { scan-assembler-times "xxbrw" 1 } } */
+/* { dg-final { scan-assembler-times "ldbrx" 1 } } */
+/* { dg-final { scan-assembler-times "lhbrx" 1 } } */
+/* { dg-final { scan-assembler-times "lwbrx" 1 } } */
+/* { dg-final { scan-assembler-times "stdbrx" 1 } } */
+/* { dg-final { scan-assembler-times "sthbrx" 1 } } */
+/* { dg-final { scan-assembler-times "stwbrx" 1 } } */