[PATCH], PR target/81593, Optimize PowerPC vector sets coming from a vector extracts

Michael Meissner Thu, 27 Jul 2017 16:21:53 -0700

This patch optimizes the PowerPC vector set operation for 64-bit doubles and
longs where the elements in the vector set may have been extracted from another
vector (PR target/81593):


Here is an example:

        vector double
        test_vpasted (vector double high, vector double low)
        {
          vector double res;
          res[1] = high[1];
          res[0] = low[0];
          return res;
        }

Previously it would generate:

        xxpermdi 12,34,34,2
        vspltisw 2,0
        xxlor 0,35,35
        xxpermdi 34,34,12,0
        xxpermdi 34,0,34,1

and with this patch, it now generates:

        xxpermdi 34,35,34,1

I have tested it on a little endian power8 system and a big endian power7
system with the usual bootstrap and make checks with no regressions.  Can I
check this into the trunk?

I also built Spec 2006 with the compiler, and saw no changes in the code
generated.  This isn't surprising because it isn't something that auto
vectorization might generate by default.

[gcc]
2017-07-27  Michael Meissner  <[email protected]>

        PR target/81593
        * config/rs6000/rs6000-protos.h (rs6000_emit_xxpermdi): New
        declaration.
        * config/rs6000/rs6000.c (rs6000_emit_xxpermdi): New function to
        emit XXPERMDI accessing either double word in either vector
        register inputs.
        * config/rs6000/vsx.md (vsx_concat_<mode>, VSX_D iterator):
        Rewrite VEC_CONCAT insn to call rs6000_emit_xxpermdi.  Simplify
        the constraints with the removal of the -mupper-regs-* switches.
        (vsx_concat_<mode>_1): New combiner insns to optimize CONCATs
        where either register might have come from VEC_SELECT.
        (vsx_concat_<mode>_2): Likewise.
        (vsx_concat_<mode>_3): Likewise.
        (vsx_set_<mode>, VSX_D iterator): Rewrite insn to generate a
        VEC_CONCAT rather than use an UNSPEC to specify the option.

[gcc/testsuite]
2017-07-27  Michael Meissner  <[email protected]>

        PR target/81593
        * gcc.target/powerpc/vsx-extract-6.c: New test.
        * gcc.target/powerpc/vsx-extract-7.c: Likewise.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: [email protected], phone: +1 (978) 899-4797

Index: gcc/config/rs6000/rs6000-protos.h
===================================================================
--- gcc/config/rs6000/rs6000-protos.h   
(svn+ssh://[email protected]/svn/gcc/trunk/gcc/config/rs6000/rs6000-protos.h)
        (revision 250577)
+++ gcc/config/rs6000/rs6000-protos.h   (.../gcc/config/rs6000/rs6000-protos.h) 
(working copy)
@@ -233,6 +233,7 @@ extern void rs6000_asm_output_dwarf_pcre
                                           const char *label);
 extern void rs6000_asm_output_dwarf_datarel (FILE *file, int size,
                                             const char *label);
+extern const char *rs6000_emit_xxpermdi (rtx[], rtx, rtx);
 
 /* Declare functions in rs6000-c.c */
 
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c  
(svn+ssh://[email protected]/svn/gcc/trunk/gcc/config/rs6000/rs6000.c)       
(revision 250577)
+++ gcc/config/rs6000/rs6000.c  (.../gcc/config/rs6000/rs6000.c)        
(working copy)
@@ -39167,6 +39167,38 @@ rs6000_optab_supported_p (int op, machin
       return true;
     }
 }
+
+
+/* Emit a XXPERMDI instruction that can extract from either double word of the
+   two arguments.  ELEMENT1 and ELEMENT2 are either NULL or they are 0/1 giving
+   which double word to be used for the operand.  */
+
+const char *
+rs6000_emit_xxpermdi (rtx operands[], rtx element1, rtx element2)
+{
+  int op1_dword = (!element1) ? 0 : INTVAL (element1);
+  int op2_dword = (!element2) ? 0 : INTVAL (element2);
+
+  gcc_assert (IN_RANGE (op1_dword | op2_dword, 0, 1));
+
+  if (BYTES_BIG_ENDIAN)
+    {
+      operands[3] = GEN_INT (2*op1_dword + op2_dword);
+      return "xxpermdi %x0,%x1,%x2,%3";
+    }
+  else
+    {
+      if (element1)
+       op1_dword = 1 - op1_dword;
+
+      if (element2)
+       op2_dword = 1 - op2_dword;
+
+      operands[3] = GEN_INT (op1_dword + 2*op2_dword);
+      return "xxpermdi %x0,%x2,%x1,%3";
+    }
+}
+
 
 struct gcc_target targetm = TARGET_INITIALIZER;
 
Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md    
(svn+ssh://[email protected]/svn/gcc/trunk/gcc/config/rs6000/vsx.md) 
(revision 250577)
+++ gcc/config/rs6000/vsx.md    (.../gcc/config/rs6000/vsx.md)  (working copy)
@@ -2366,19 +2366,17 @@ (define_insn "*vsx_float_fix_v2df2"
 
 ;; Build a V2DF/V2DI vector from two scalars
 (define_insn "vsx_concat_<mode>"
-  [(set (match_operand:VSX_D 0 "gpc_reg_operand" "=<VSa>,we")
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa,we")
        (vec_concat:VSX_D
-        (match_operand:<VS_scalar> 1 "gpc_reg_operand" "<VS_64reg>,b")
-        (match_operand:<VS_scalar> 2 "gpc_reg_operand" "<VS_64reg>,b")))]
+        (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa,b")
+        (match_operand:<VS_scalar> 2 "gpc_reg_operand" "wa,b")))]
   "VECTOR_MEM_VSX_P (<MODE>mode)"
 {
   if (which_alternative == 0)
-    return (BYTES_BIG_ENDIAN
-           ? "xxpermdi %x0,%x1,%x2,0"
-           : "xxpermdi %x0,%x2,%x1,0");
+    return rs6000_emit_xxpermdi (operands, NULL_RTX, NULL_RTX);
 
   else if (which_alternative == 1)
-    return (BYTES_BIG_ENDIAN
+    return (VECTOR_ELT_ORDER_BIG
            ? "mtvsrdd %x0,%1,%2"
            : "mtvsrdd %x0,%2,%1");
 
@@ -2387,6 +2385,47 @@ (define_insn "vsx_concat_<mode>"
 }
   [(set_attr "type" "vecperm")])
 
+;; Combiner patterns to allow creating XXPERMDI's to access either double
+;; word element in a vector register.  Note, rs6000_emit_xxpermdi expects
+;; operands[0..2] to be the vector registers.
+(define_insn "*vsx_concat_<mode>_1"
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+       (vec_concat:VSX_D
+        (vec_select:<VS_scalar>
+         (match_operand:VSX_D 1 "gpc_reg_operand" "wa")
+         (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")]))
+        (match_operand:<VS_scalar> 2 "gpc_reg_operand" "wa")))]
+  "VECTOR_MEM_VSX_P (<MODE>mode)"
+{
+  return rs6000_emit_xxpermdi (operands, operands[3], NULL_RTX);
+})
+
+(define_insn "*vsx_concat_<mode>_2"
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+       (vec_concat:VSX_D
+        (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa")
+        (vec_select:<VS_scalar>
+         (match_operand:VSX_D 2 "gpc_reg_operand" "wa")
+         (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")]))))]
+  "VECTOR_MEM_VSX_P (<MODE>mode)"
+{
+  return rs6000_emit_xxpermdi (operands, NULL_RTX, operands[3]);
+})
+
+(define_insn "*vsx_concat_<mode>_3"
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+       (vec_concat:VSX_D
+        (vec_select:<VS_scalar>
+         (match_operand:VSX_D 1 "gpc_reg_operand" "wa")
+         (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")]))
+        (vec_select:<VS_scalar>
+         (match_operand:VSX_D 2 "gpc_reg_operand" "wa")
+         (parallel [(match_operand:QI 4 "const_0_to_1_operand" "n")]))))]
+  "VECTOR_MEM_VSX_P (<MODE>mode)"
+{
+  return rs6000_emit_xxpermdi (operands, operands[3], operands[4]);
+})
+
 ;; Special purpose concat using xxpermdi to glue two single precision values
 ;; together, relying on the fact that internally scalar floats are represented
 ;; as doubles.  This is used to initialize a V4SF vector with 4 floats
@@ -2587,25 +2626,35 @@ (define_expand "vsx_set_v1ti"
   DONE;
 })
 
-;; Set the element of a V2DI/VD2F mode
-(define_insn "vsx_set_<mode>"
-  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wd,?<VSa>")
-       (unspec:VSX_D
-        [(match_operand:VSX_D 1 "vsx_register_operand" "wd,<VSa>")
-         (match_operand:<VS_scalar> 2 "vsx_register_operand" 
"<VS_64reg>,<VSa>")
-         (match_operand:QI 3 "u5bit_cint_operand" "i,i")]
-        UNSPEC_VSX_SET))]
+;; Rewrite V2DF/V2DI set in terms of VEC_CONCAT
+(define_expand "vsx_set_<mode>"
+  [(use (match_operand:VSX_D 0 "vsx_register_operand"))
+   (use (match_operand:VSX_D 1 "vsx_register_operand"))
+   (use (match_operand:<VS_scalar> 2 "gpc_reg_operand"))
+   (use (match_operand:QI 3 "const_0_to_1_operand"))]
   "VECTOR_MEM_VSX_P (<MODE>mode)"
 {
-  int idx_first = BYTES_BIG_ENDIAN ? 0 : 1;
-  if (INTVAL (operands[3]) == idx_first)
-    return \"xxpermdi %x0,%x2,%x1,1\";
-  else if (INTVAL (operands[3]) == 1 - idx_first)
-    return \"xxpermdi %x0,%x1,%x2,0\";
+  rtx dest = operands[0];
+  rtx vec_reg = operands[1];
+  rtx value = operands[2];
+  rtx ele = operands[3];
+  rtx tmp = gen_reg_rtx (<VS_scalar>mode);
+
+  if (ele == const0_rtx)
+    {
+      emit_insn (gen_vsx_extract_<mode> (tmp, vec_reg, const1_rtx));
+      emit_insn (gen_vsx_concat_<mode> (dest, value, tmp));
+      DONE;
+    }
+  else if (ele == const1_rtx)
+    {
+      emit_insn (gen_vsx_extract_<mode> (tmp, vec_reg, const0_rtx));
+      emit_insn (gen_vsx_concat_<mode> (dest, tmp, value));
+      DONE;
+    }
   else
     gcc_unreachable ();
-}
-  [(set_attr "type" "vecperm")])
+})
 
 ;; Extract a DF/DI element from V2DF/V2DI
 ;; Optimize cases were we can do a simple or direct move.
Index: gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c    
(svn+ssh://[email protected]/svn/gcc/trunk/gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c)
 (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c    
(.../gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c)  (revision 250640)
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+vector unsigned long
+test_vpasted (vector unsigned long high, vector unsigned long low)
+{
+  vector unsigned long res;
+  res[1] = high[1];
+  res[0] = low[0];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */
Index: gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c    
(svn+ssh://[email protected]/svn/gcc/trunk/gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c)
 (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c    
(.../gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c)  (revision 250640)
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+vector double
+test_vpasted (vector double high, vector double low)
+{
+  vector double res;
+  res[1] = high[1];
+  res[0] = low[0];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */

[PATCH], PR target/81593, Optimize PowerPC vector sets coming from a vector extracts

Reply via email to