Re: [PATCH], PR target/81593, Optimize PowerPC vector sets coming from a vector extracts

Michael Meissner Wed, 02 Aug 2017 07:30:01 -0700

On Fri, Jul 28, 2017 at 04:08:50PM -0500, Segher Boessenkool wrote:
> 
> "emit" is not a good name for this: that is generally used for something
> that does emit_insn, i.e. put an insn in the instruction stream.  This
> function returns a string a define_insn can return.  For the rl* insns
> I called the similar functions rs6000_insn_for_*, maybe something like
> that is better here?


...

> I think calling this with the rtx elementN args makes this only more
> complicated (the function comment doesn't say what they are or what
> NULL means, btw).

...

> In this and the other testcase, should you test no other insns at all
> are generated?

Here are the revised patches.  I tested on a little endian power8 system and a
big endian power7 system.  Are these patches ok for the trunk?

[gcc]
2017-08-02  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        PR target/81593
        * config/rs6000/rs6000-protos.h (rs6000_output_xxpermdi): New
        declaration.
        * config/rs6000/rs6000.c (rs6000_output_xxpermdi): New function to
        emit XXPERMDI accessing either double word of either vector
        register input.
        * config/rs6000/vsx.md (vsx_concat_<mode>, VSX_D iterator):
        Rewrite VEC_CONCAT insn to call rs6000_output_xxpermdi.  Simplify
        the constraints with the removal of the -mupper-regs-* switches.
        (vsx_concat_<mode>_1): New combiner insns to optimize CONCATs
        where either register might have come from VEC_SELECT.
        (vsx_concat_<mode>_2): Likewise.
        (vsx_concat_<mode>_3): Likewise.
        (vsx_set_<mode>, VSX_D iterator): Rewrite insn to generate a
        VEC_CONCAT rather than use an UNSPEC to specify the option.

[gcc/testsuite]
2017-08-02  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        PR target/81593
        * gcc.target/powerpc/vsx-extract-6.c: New test.
        * gcc.target/powerpc/vsx-extract-7.c: Likewise.


-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797

Index: gcc/config/rs6000/rs6000-protos.h
===================================================================
--- gcc/config/rs6000/rs6000-protos.h   
(.../svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000)    
(revision 250793)
+++ gcc/config/rs6000/rs6000-protos.h   (.../gcc/config/rs6000) (working copy)
@@ -233,6 +233,7 @@ extern void rs6000_asm_output_dwarf_pcre
                                           const char *label);
 extern void rs6000_asm_output_dwarf_datarel (FILE *file, int size,
                                             const char *label);
+extern const char *rs6000_output_xxpermdi (rtx, rtx, rtx, rtx, rtx);
 
 /* Declare functions in rs6000-c.c */
 
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c  
(.../svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000)    
(revision 250793)
+++ gcc/config/rs6000/rs6000.c  (.../gcc/config/rs6000) (working copy)
@@ -39007,6 +39007,60 @@ rs6000_optab_supported_p (int op, machin
       return true;
     }
 }
+
+
+/* Output a xxpermdi instruction that sets a 128-bit vector DEST combining two
+   inputs SRC1 and SRC2.
+
+   If ELEMENT1 is NULL, use the top 64-bit double word of SRC1.  If it is
+   non-NULL, it is a 0 or 1 constant that gives the vector element number to
+   use for extracting the 64-bit double word from SRC1.
+
+   If ELEMENT2 is NULL, use the top 64-bit double word of SRC2.  If it is
+   non-NULL, it is a 0 or 1 constant that gives the vector element number to
+   use for extracting the 64-bit double word from SRC2.
+
+   The element number is based on the user element ordering, set by the
+   endianness and by the -maltivec={le,be} options.  */
+
+const char *
+rs6000_output_xxpermdi (rtx dest,
+                       rtx src1,
+                       rtx src2,
+                       rtx element1,
+                       rtx element2)
+{
+  int op1_dword = (!element1) ? 0 : INTVAL (element1);
+  int op2_dword = (!element2) ? 0 : INTVAL (element2);
+  rtx xops[10];
+  const char *insn_string;
+
+  gcc_assert (IN_RANGE (op1_dword | op2_dword, 0, 1));
+  xops[0] = dest;
+  xops[1] = src1;
+  xops[2] = src2;
+
+  if (BYTES_BIG_ENDIAN)
+    {
+      xops[3] = GEN_INT (2*op1_dword + op2_dword);
+      insn_string = "xxpermdi %x0,%x1,%x2,%3";
+    }
+  else
+    {
+      if (element1)
+       op1_dword = 1 - op1_dword;
+
+      if (element2)
+       op2_dword = 1 - op2_dword;
+
+      xops[3] = GEN_INT (op1_dword + 2*op2_dword);
+      insn_string = "xxpermdi %x0,%x2,%x1,%3";
+    }
+
+  output_asm_insn (insn_string, xops);
+  return "";
+}
+
 
 struct gcc_target targetm = TARGET_INITIALIZER;
 
Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md    
(.../svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000)    
(revision 250793)
+++ gcc/config/rs6000/vsx.md    (.../gcc/config/rs6000) (working copy)
@@ -2364,19 +2364,18 @@ (define_insn "*vsx_float_fix_v2df2"
 
 ;; Build a V2DF/V2DI vector from two scalars
 (define_insn "vsx_concat_<mode>"
-  [(set (match_operand:VSX_D 0 "gpc_reg_operand" "=<VSa>,we")
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa,we")
        (vec_concat:VSX_D
-        (match_operand:<VS_scalar> 1 "gpc_reg_operand" "<VS_64reg>,b")
-        (match_operand:<VS_scalar> 2 "gpc_reg_operand" "<VS_64reg>,b")))]
+        (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa,b")
+        (match_operand:<VS_scalar> 2 "gpc_reg_operand" "wa,b")))]
   "VECTOR_MEM_VSX_P (<MODE>mode)"
 {
   if (which_alternative == 0)
-    return (BYTES_BIG_ENDIAN
-           ? "xxpermdi %x0,%x1,%x2,0"
-           : "xxpermdi %x0,%x2,%x1,0");
+    return rs6000_output_xxpermdi (operands[0], operands[1], operands[2],
+                                  NULL_RTX, NULL_RTX);
 
   else if (which_alternative == 1)
-    return (BYTES_BIG_ENDIAN
+    return (VECTOR_ELT_ORDER_BIG
            ? "mtvsrdd %x0,%1,%2"
            : "mtvsrdd %x0,%2,%1");
 
@@ -2385,6 +2384,50 @@ (define_insn "vsx_concat_<mode>"
 }
   [(set_attr "type" "vecperm")])
 
+;; Combiner patterns to allow creating XXPERMDIs that access either double
+;; word of a vector register.  Note, rs6000_output_xxpermdi expects
+;; operands[0..2] to be the vector registers.
+(define_insn "*vsx_concat_<mode>_1"
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+       (vec_concat:VSX_D
+        (vec_select:<VS_scalar>
+         (match_operand:VSX_D 1 "gpc_reg_operand" "wa")
+         (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")]))
+        (match_operand:<VS_scalar> 2 "gpc_reg_operand" "wa")))]
+  "VECTOR_MEM_VSX_P (<MODE>mode)"
+{
+  return rs6000_output_xxpermdi (operands[0], operands[1], operands[2],
+                                operands[3], NULL_RTX);
+})
+
+(define_insn "*vsx_concat_<mode>_2"
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+       (vec_concat:VSX_D
+        (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa")
+        (vec_select:<VS_scalar>
+         (match_operand:VSX_D 2 "gpc_reg_operand" "wa")
+         (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")]))))]
+  "VECTOR_MEM_VSX_P (<MODE>mode)"
+{
+  return rs6000_output_xxpermdi (operands[0], operands[1], operands[2],
+                                NULL_RTX, operands[3]);
+})
+
+(define_insn "*vsx_concat_<mode>_3"
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+       (vec_concat:VSX_D
+        (vec_select:<VS_scalar>
+         (match_operand:VSX_D 1 "gpc_reg_operand" "wa")
+         (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")]))
+        (vec_select:<VS_scalar>
+         (match_operand:VSX_D 2 "gpc_reg_operand" "wa")
+         (parallel [(match_operand:QI 4 "const_0_to_1_operand" "n")]))))]
+  "VECTOR_MEM_VSX_P (<MODE>mode)"
+{
+  return rs6000_output_xxpermdi (operands[0], operands[1], operands[2],
+                                operands[3], operands[4]);
+})
+
 ;; Special purpose concat using xxpermdi to glue two single precision values
 ;; together, relying on the fact that internally scalar floats are represented
 ;; as doubles.  This is used to initialize a V4SF vector with 4 floats
@@ -2585,25 +2628,35 @@ (define_expand "vsx_set_v1ti"
   DONE;
 })
 
-;; Set the element of a V2DI/VD2F mode
-(define_insn "vsx_set_<mode>"
-  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wd,?<VSa>")
-       (unspec:VSX_D
-        [(match_operand:VSX_D 1 "vsx_register_operand" "wd,<VSa>")
-         (match_operand:<VS_scalar> 2 "vsx_register_operand" "<VS_64reg>,<VSa>")
-         (match_operand:QI 3 "u5bit_cint_operand" "i,i")]
-        UNSPEC_VSX_SET))]
+;; Rewrite V2DF/V2DI set in terms of VEC_CONCAT
+(define_expand "vsx_set_<mode>"
+  [(use (match_operand:VSX_D 0 "vsx_register_operand"))
+   (use (match_operand:VSX_D 1 "vsx_register_operand"))
+   (use (match_operand:<VS_scalar> 2 "gpc_reg_operand"))
+   (use (match_operand:QI 3 "const_0_to_1_operand"))]
   "VECTOR_MEM_VSX_P (<MODE>mode)"
 {
-  int idx_first = BYTES_BIG_ENDIAN ? 0 : 1;
-  if (INTVAL (operands[3]) == idx_first)
-    return \"xxpermdi %x0,%x2,%x1,1\";
-  else if (INTVAL (operands[3]) == 1 - idx_first)
-    return \"xxpermdi %x0,%x1,%x2,0\";
+  rtx dest = operands[0];
+  rtx vec_reg = operands[1];
+  rtx value = operands[2];
+  rtx ele = operands[3];
+  rtx tmp = gen_reg_rtx (<VS_scalar>mode);
+
+  if (ele == const0_rtx)
+    {
+      emit_insn (gen_vsx_extract_<mode> (tmp, vec_reg, const1_rtx));
+      emit_insn (gen_vsx_concat_<mode> (dest, value, tmp));
+      DONE;
+    }
+  else if (ele == const1_rtx)
+    {
+      emit_insn (gen_vsx_extract_<mode> (tmp, vec_reg, const0_rtx));
+      emit_insn (gen_vsx_concat_<mode> (dest, tmp, value));
+      DONE;
+    }
   else
     gcc_unreachable ();
-}
-  [(set_attr "type" "vecperm")])
+})
 
 ;; Extract a DF/DI element from V2DF/V2DI
 ;; Optimize cases were we can do a simple or direct move.
Index: gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c    
(.../svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/testsuite/gcc.target/powerpc)
     (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c    
(.../gcc/testsuite/gcc.target/powerpc)  (revision 250804)
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+vector unsigned long
+test_vpasted (vector unsigned long high, vector unsigned long low)
+{
+  vector unsigned long res;
+  res[1] = high[1];
+  res[0] = low[0];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1    } } */
+/* { dg-final { scan-assembler-not   {\mvspltisw\M}      } } */
+/* { dg-final { scan-assembler-not   {\mxxlor\M}         } } */
+/* { dg-final { scan-assembler-not   {\mxxlxor\M}        } } */
+/* { dg-final { scan-assembler-not   {\mxxspltib\M}      } } */
+/* { dg-final { scan-assembler-not   {\mlxvx?\M}         } } */
+/* { dg-final { scan-assembler-not   {\mlxv[dw][24]x\M}  } } */
+/* { dg-final { scan-assembler-not   {\mlvx\M}           } } */
+/* { dg-final { scan-assembler-not   {\mstxvx?\M}        } } */
+/* { dg-final { scan-assembler-not   {\mstxv[dw][24]x\M} } } */
+/* { dg-final { scan-assembler-not   {\mstvx\M}          } } */
Index: gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c    
(.../svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/testsuite/gcc.target/powerpc)
     (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c    
(.../gcc/testsuite/gcc.target/powerpc)  (revision 250804)
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+vector double
+test_vpasted (vector double high, vector double low)
+{
+  vector double res;
+  res[1] = high[1];
+  res[0] = low[0];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1    } } */
+/* { dg-final { scan-assembler-not   {\mvspltisw\M}      } } */
+/* { dg-final { scan-assembler-not   {\mxxlor\M}         } } */
+/* { dg-final { scan-assembler-not   {\mxxlxor\M}        } } */
+/* { dg-final { scan-assembler-not   {\mxxspltib\M}      } } */
+/* { dg-final { scan-assembler-not   {\mlxvx?\M}         } } */
+/* { dg-final { scan-assembler-not   {\mlxv[dw][24]x\M}  } } */
+/* { dg-final { scan-assembler-not   {\mlvx\M}           } } */
+/* { dg-final { scan-assembler-not   {\mstxvx?\M}        } } */
+/* { dg-final { scan-assembler-not   {\mstxv[dw][24]x\M} } } */
+/* { dg-final { scan-assembler-not   {\mstvx\M}          } } */

Re: [PATCH], PR target/81593, Optimize PowerPC vector sets coming from a vector extracts

Reply via email to