This patch optimizes the PowerPC vector set operation for 64-bit doubles and longs where the elements in the vector set may have been extracted from another vector (PR target/81593):
Here is an example: vector double test_vpasted (vector double high, vector double low) { vector double res; res[1] = high[1]; res[0] = low[0]; return res; } Previously it would generate: xxpermdi 12,34,34,2 vspltisw 2,0 xxlor 0,35,35 xxpermdi 34,34,12,0 xxpermdi 34,0,34,1 and with these patches, it now generates: xxpermdi 34,35,34,1 I have tested it on a little endian power8 system and a big endian power7 system with the usual bootstrap and make checks with no regressions. Can I check this into the trunk? I also built Spec 2006 with the compiler, and saw no changes in the code generated. This isn't surprising because it isn't something that auto vectorization might generate by default. [gcc] 2017-07-27 Michael Meissner <meiss...@linux.vnet.ibm.com> PR target/81593 * config/rs6000/rs6000-protos.h (rs6000_emit_xxpermdi): New declaration. * config/rs6000/rs6000.c (rs6000_emit_xxpermdi): New function to emit XXPERMDI accessing either double word in either vector register inputs. * config/rs6000/vsx.md (vsx_concat_<mode>, VSX_D iterator): Rewrite VEC_CONCAT insn to call rs6000_emit_xxpermdi. Simplify the constraints with the removal of the -mupper-regs-* switches. (vsx_concat_<mode>_1): New combiner insns to optimize CONCATs where either register might have come from VEC_SELECT. (vsx_concat_<mode>_2): Likewise. (vsx_concat_<mode>_3): Likewise. (vsx_set_<mode>, VSX_D iterator): Rewrite insn to generate a VEC_CONCAT rather than use an UNSPEC to specify the option. [gcc/testsuite] 2017-07-27 Michael Meissner <meiss...@linux.vnet.ibm.com> PR target/81593 * gcc.target/powerpc/vsx-extract-6.c: New test. * gcc.target/powerpc/vsx-extract-7.c: Likewise. -- Michael Meissner, IBM IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/config/rs6000/rs6000-protos.h =================================================================== --- gcc/config/rs6000/rs6000-protos.h (svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000/rs6000-protos.h) (revision 250577) +++ gcc/config/rs6000/rs6000-protos.h (.../gcc/config/rs6000/rs6000-protos.h) (working copy) @@ -233,6 +233,7 @@ extern void rs6000_asm_output_dwarf_pcre const char *label); extern void rs6000_asm_output_dwarf_datarel (FILE *file, int size, const char *label); +extern const char *rs6000_emit_xxpermdi (rtx[], rtx, rtx); /* Declare functions in rs6000-c.c */ Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000/rs6000.c) (revision 250577) +++ gcc/config/rs6000/rs6000.c (.../gcc/config/rs6000/rs6000.c) (working copy) @@ -39167,6 +39167,38 @@ rs6000_optab_supported_p (int op, machin return true; } } + + +/* Emit a XXPERMDI instruction that can extract from either double word of the + two arguments. ELEMENT1 and ELEMENT2 are either NULL or they are 0/1 giving + which double word to be used for the operand. */ + +const char * +rs6000_emit_xxpermdi (rtx operands[], rtx element1, rtx element2) +{ + int op1_dword = (!element1) ? 0 : INTVAL (element1); + int op2_dword = (!element2) ? 
0 : INTVAL (element2); + + gcc_assert (IN_RANGE (op1_dword | op2_dword, 0, 1)); + + if (BYTES_BIG_ENDIAN) + { + operands[3] = GEN_INT (2*op1_dword + op2_dword); + return "xxpermdi %x0,%x1,%x2,%3"; + } + else + { + if (element1) + op1_dword = 1 - op1_dword; + + if (element2) + op2_dword = 1 - op2_dword; + + operands[3] = GEN_INT (op1_dword + 2*op2_dword); + return "xxpermdi %x0,%x2,%x1,%3"; + } +} + struct gcc_target targetm = TARGET_INITIALIZER; Index: gcc/config/rs6000/vsx.md =================================================================== --- gcc/config/rs6000/vsx.md (svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000/vsx.md) (revision 250577) +++ gcc/config/rs6000/vsx.md (.../gcc/config/rs6000/vsx.md) (working copy) @@ -2366,19 +2366,17 @@ (define_insn "*vsx_float_fix_v2df2" ;; Build a V2DF/V2DI vector from two scalars (define_insn "vsx_concat_<mode>" - [(set (match_operand:VSX_D 0 "gpc_reg_operand" "=<VSa>,we") + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa,we") (vec_concat:VSX_D - (match_operand:<VS_scalar> 1 "gpc_reg_operand" "<VS_64reg>,b") - (match_operand:<VS_scalar> 2 "gpc_reg_operand" "<VS_64reg>,b")))] + (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa,b") + (match_operand:<VS_scalar> 2 "gpc_reg_operand" "wa,b")))] "VECTOR_MEM_VSX_P (<MODE>mode)" { if (which_alternative == 0) - return (BYTES_BIG_ENDIAN - ? "xxpermdi %x0,%x1,%x2,0" - : "xxpermdi %x0,%x2,%x1,0"); + return rs6000_emit_xxpermdi (operands, NULL_RTX, NULL_RTX); else if (which_alternative == 1) - return (BYTES_BIG_ENDIAN + return (VECTOR_ELT_ORDER_BIG ? "mtvsrdd %x0,%1,%2" : "mtvsrdd %x0,%2,%1"); @@ -2387,6 +2385,47 @@ (define_insn "vsx_concat_<mode>" } [(set_attr "type" "vecperm")]) +;; Combiner patterns to allow creating XXPERMDI's to access either double +;; register in a vector register. Note, rs6000_emit_xxpermdi expects +;; operands[0..2] to be the vector registers. 
+(define_insn "*vsx_concat_<mode>_1" + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") + (vec_concat:VSX_D + (vec_select:<VS_scalar> + (match_operand:VSX_D 1 "gpc_reg_operand" "wa") + (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")])) + (match_operand:<VS_scalar> 2 "gpc_reg_operand" "wa")))] + "VECTOR_MEM_VSX_P (<MODE>mode)" +{ + return rs6000_emit_xxpermdi (operands, operands[3], NULL_RTX); +}) + +(define_insn "*vsx_concat_<mode>_2" + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") + (vec_concat:VSX_D + (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa") + (vec_select:<VS_scalar> + (match_operand:VSX_D 2 "gpc_reg_operand" "wa") + (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")]))))] + "VECTOR_MEM_VSX_P (<MODE>mode)" +{ + return rs6000_emit_xxpermdi (operands, NULL_RTX, operands[3]); +}) + +(define_insn "*vsx_concat_<mode>_3" + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") + (vec_concat:VSX_D + (vec_select:<VS_scalar> + (match_operand:VSX_D 1 "gpc_reg_operand" "wa") + (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")])) + (vec_select:<VS_scalar> + (match_operand:VSX_D 2 "gpc_reg_operand" "wa") + (parallel [(match_operand:QI 4 "const_0_to_1_operand" "n")]))))] + "VECTOR_MEM_VSX_P (<MODE>mode)" +{ + return rs6000_emit_xxpermdi (operands, operands[3], operands[4]); +}) + ;; Special purpose concat using xxpermdi to glue two single precision values ;; together, relying on the fact that internally scalar floats are represented ;; as doubles. 
This is used to initialize a V4SF vector with 4 floats @@ -2587,25 +2626,35 @@ (define_expand "vsx_set_v1ti" DONE; }) -;; Set the element of a V2DI/VD2F mode -(define_insn "vsx_set_<mode>" - [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wd,?<VSa>") - (unspec:VSX_D - [(match_operand:VSX_D 1 "vsx_register_operand" "wd,<VSa>") - (match_operand:<VS_scalar> 2 "vsx_register_operand" "<VS_64reg>,<VSa>") - (match_operand:QI 3 "u5bit_cint_operand" "i,i")] - UNSPEC_VSX_SET))] +;; Rewrite V2DF/V2DI set in terms of VEC_CONCAT +(define_expand "vsx_set_<mode>" + [(use (match_operand:VSX_D 0 "vsx_register_operand")) + (use (match_operand:VSX_D 1 "vsx_register_operand")) + (use (match_operand:<VS_scalar> 2 "gpc_reg_operand")) + (use (match_operand:QI 3 "const_0_to_1_operand"))] "VECTOR_MEM_VSX_P (<MODE>mode)" { - int idx_first = BYTES_BIG_ENDIAN ? 0 : 1; - if (INTVAL (operands[3]) == idx_first) - return \"xxpermdi %x0,%x2,%x1,1\"; - else if (INTVAL (operands[3]) == 1 - idx_first) - return \"xxpermdi %x0,%x1,%x2,0\"; + rtx dest = operands[0]; + rtx vec_reg = operands[1]; + rtx value = operands[2]; + rtx ele = operands[3]; + rtx tmp = gen_reg_rtx (<VS_scalar>mode); + + if (ele == const0_rtx) + { + emit_insn (gen_vsx_extract_<mode> (tmp, vec_reg, const1_rtx)); + emit_insn (gen_vsx_concat_<mode> (dest, value, tmp)); + DONE; + } + else if (ele == const1_rtx) + { + emit_insn (gen_vsx_extract_<mode> (tmp, vec_reg, const0_rtx)); + emit_insn (gen_vsx_concat_<mode> (dest, tmp, value)); + DONE; + } else gcc_unreachable (); -} - [(set_attr "type" "vecperm")]) +}) ;; Extract a DF/DI element from V2DF/V2DI ;; Optimize cases were we can do a simple or direct move. 
Index: gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c (svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c) (revision 0) +++ gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c (.../gcc/testsuite/gcc.target/powerpc/vsx-extract-6.c) (revision 250640) @@ -0,0 +1,15 @@ +/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ +/* { dg-require-effective-target powerpc_vsx_ok } */ +/* { dg-options "-O2 -mvsx" } */ + +vector unsigned long +test_vpasted (vector unsigned long high, vector unsigned long low) +{ + vector unsigned long res; + res[1] = high[1]; + res[0] = low[0]; + return res; +} + +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */ Index: gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c (svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c) (revision 0) +++ gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c (.../gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c) (revision 250640) @@ -0,0 +1,15 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ +/* { dg-require-effective-target powerpc_vsx_ok } */ +/* { dg-options "-O2 -mvsx" } */ + +vector double +test_vpasted (vector double high, vector double low) +{ + vector double res; + res[1] = high[1]; + res[0] = low[0]; + return res; +} + +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */