The s390 backend missed constant vector permutation cases that can be implemented with the vector pack instruction, or with a vector merge instruction operating on a different vector element size. This patch enables these additional patterns, which do not need to load a constant vector for the permutation.
Bootstrapped and regtested on s390. Okay for trunk? gcc/ChangeLog: * config/s390/s390.cc (expand_perm_with_merge): Add size change cases. (expand_perm_with_pack): New function. (vectorize_vec_perm_const_1): Wire up new function. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-perm-merge-1.c: New test. * gcc.target/s390/vector/vec-perm-pack-1.c: New test. Signed-off-by: Juergen Christ <jchr...@linux.ibm.com> --- gcc/config/s390/s390.cc | 169 +++++++++++- .../gcc.target/s390/vector/vec-perm-merge-1.c | 242 ++++++++++++++++++ .../gcc.target/s390/vector/vec-perm-pack-1.c | 133 ++++++++++ 3 files changed, 542 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-perm-merge-1.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-perm-pack-1.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 38267202f668..de9c15c7bd42 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -18041,9 +18041,34 @@ expand_perm_with_merge (const struct expand_vec_perm_d &d) static const unsigned char lo_perm_qi_swap[16] = {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}; + static const unsigned char hi_perm_qi_di[16] + = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; + static const unsigned char hi_perm_qi_si[16] + = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; + static const unsigned char hi_perm_qi_hi[16] + = {0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23}; + + static const unsigned char lo_perm_qi_di[16] + = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; + static const unsigned char lo_perm_qi_si[16] + = {8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}; + static const unsigned char lo_perm_qi_hi[16] + = {8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31}; + + static const unsigned char hi_perm_hi_si[8] = {0, 1, 8, 9, 2, 3, 10, 11}; + static const unsigned char hi_perm_hi_di[8] = {0, 1, 2, 3, 8, 9, 10, 11}; + + static const 
unsigned char lo_perm_hi_si[8] = {4, 5, 12, 13, 6, 7, 14, 15}; + static const unsigned char lo_perm_hi_di[8] = {4, 5, 6, 7, 12, 13, 14, 15}; + + static const unsigned char hi_perm_si_di[4] = {0, 1, 4, 5}; + + static const unsigned char lo_perm_si_di[4] = {2, 3, 6, 7}; + bool merge_lo_p = false; bool merge_hi_p = false; bool swap_operands_p = false; + machine_mode mergemode = d.vmode; if ((d.nelt == 2 && memcmp (d.perm, hi_perm_di, 2) == 0) || (d.nelt == 4 && memcmp (d.perm, hi_perm_si, 4) == 0) @@ -18075,6 +18100,75 @@ expand_perm_with_merge (const struct expand_vec_perm_d &d) merge_lo_p = true; swap_operands_p = true; } + else if (d.nelt == 16) + { + if (memcmp (d.perm, hi_perm_qi_di, 16) == 0) + { + merge_hi_p = true; + mergemode = E_V2DImode; + } + else if (memcmp (d.perm, hi_perm_qi_si, 16) == 0) + { + merge_hi_p = true; + mergemode = E_V4SImode; + } + else if (memcmp (d.perm, hi_perm_qi_hi, 16) == 0) + { + merge_hi_p = true; + mergemode = E_V8HImode; + } + else if (memcmp (d.perm, lo_perm_qi_di, 16) == 0) + { + merge_lo_p = true; + mergemode = E_V2DImode; + } + else if (memcmp (d.perm, lo_perm_qi_si, 16) == 0) + { + merge_lo_p = true; + mergemode = E_V4SImode; + } + else if (memcmp (d.perm, lo_perm_qi_hi, 16) == 0) + { + merge_lo_p = true; + mergemode = E_V8HImode; + } + } + else if (d.nelt == 8) + { + if (memcmp (d.perm, hi_perm_hi_di, 8) == 0) + { + merge_hi_p = true; + mergemode = E_V2DImode; + } + else if (memcmp (d.perm, hi_perm_hi_si, 8) == 0) + { + merge_hi_p = true; + mergemode = E_V4SImode; + } + else if (memcmp (d.perm, lo_perm_hi_di, 8) == 0) + { + merge_lo_p = true; + mergemode = E_V2DImode; + } + else if (memcmp (d.perm, lo_perm_hi_si, 8) == 0) + { + merge_lo_p = true; + mergemode = E_V4SImode; + } + } + else if (d.nelt == 4) + { + if (memcmp (d.perm, hi_perm_si_di, 4) == 0) + { + merge_hi_p = true; + mergemode = E_V2DImode; + } + else if (memcmp (d.perm, lo_perm_si_di, 4) == 0) + { + merge_lo_p = true; + mergemode = E_V2DImode; + } + } if 
(!merge_lo_p && !merge_hi_p) return false; @@ -18082,7 +18176,7 @@ expand_perm_with_merge (const struct expand_vec_perm_d &d) if (d.testing_p) return merge_lo_p || merge_hi_p; - rtx op0, op1; + rtx op0, op1, target = d.target; if (swap_operands_p) { op0 = d.op1; @@ -18093,9 +18187,77 @@ expand_perm_with_merge (const struct expand_vec_perm_d &d) op0 = d.op0; op1 = d.op1; } + if (mergemode != d.vmode) + { + target = simplify_gen_subreg (mergemode, target, d.vmode, 0); + op0 = simplify_gen_subreg (mergemode, op0, d.vmode, 0); + op1 = simplify_gen_subreg (mergemode, op1, d.vmode, 0); + } + + s390_expand_merge (target, op0, op1, merge_hi_p); + + return true; +} + +/* Try to expand the vector permute operation described by D using the vector + pack instruction vpk. Return true if vector pack could be used. */ +static bool +expand_perm_with_pack (const struct expand_vec_perm_d &d) +{ + static const unsigned char qi_hi[16] + = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; + static const unsigned char qi_si[16] + = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31}; + static const unsigned char qi_di[16] + = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + + static const unsigned char hi_si[8] + = {1, 3, 5, 7, 9, 11, 13, 15}; + static const unsigned char hi_di[8] + = {2, 3, 6, 7, 10, 11, 14, 15}; + + static const unsigned char si_di[4] + = {1, 3, 5, 7}; + + machine_mode packmode, resmode; + enum insn_code code = CODE_FOR_nothing; + + if (d.nelt == 16 && memcmp (d.perm, qi_hi, 16) == 0) + { + packmode = E_V8HImode; + resmode = E_V16QImode; + code = CODE_FOR_vec_pack_trunc_v8hi; + } + else if ((d.nelt == 16 && memcmp (d.perm, qi_si, 16) == 0) + || (d.nelt == 8 && memcmp (d.perm, hi_si, 8) == 0)) + { + packmode = E_V4SImode; + resmode = E_V8HImode; + code = CODE_FOR_vec_pack_trunc_v4si; + } + else if ((d.nelt == 16 && memcmp (d.perm, qi_di, 16) == 0) + || (d.nelt == 8 && memcmp (d.perm, hi_di, 8) == 0) + || (d.nelt == 4 && memcmp 
(d.perm, si_di, 4) == 0)) + { + packmode = E_V2DImode; + resmode = E_V4SImode; + code = CODE_FOR_vec_pack_trunc_v2di; + } - s390_expand_merge (d.target, op0, op1, merge_hi_p); + if (code == CODE_FOR_nothing) + return false; + if (d.testing_p) + return true; + rtx target = simplify_gen_subreg (resmode, d.target, d.vmode, 0); + rtx op0 = simplify_gen_subreg (packmode, + force_reg (GET_MODE (d.op0), d.op0), + d.vmode, 0); + rtx op1 = simplify_gen_subreg (packmode, + force_reg (GET_MODE (d.op1), d.op1), + d.vmode, 0); + rtx pat = GEN_FCN (code) (target, op0, op1); + emit_insn (pat); return true; } @@ -18322,6 +18484,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) if (expand_perm_with_merge (d)) return true; + if (expand_perm_with_pack (d)) + return true; + if (expand_perm_with_vpdi (d)) return true; diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-perm-merge-1.c b/gcc/testsuite/gcc.target/s390/vector/vec-perm-merge-1.c new file mode 100644 index 000000000000..2b639e306888 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-perm-merge-1.c @@ -0,0 +1,242 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */ +/* { dg-do run { target { s390_z14_hw } } } */ +/* { dg-final {check-function-bodies "**" "" } } */ + +#include "vec-types.h" + +/* +** qi_via_hi_hi: +** vmrhh %v24,%v24,%v26 +** br %r14 +*/ +v16qi __attribute__((noinline,noipa)) +qi_via_hi_hi (v16qi a, v16qi b) +{ + return (v16qi){a[0], a[1], b[0], b[1], a[2], a[3], b[2], b[3], + a[4], a[5], b[4], b[5], a[6], a[7], b[6], b[7]}; +} + +/* +** qi_via_hi_lo: +** vmrlh %v24,%v24,%v26 +** br %r14 +*/ +v16qi __attribute__((noinline,noipa)) +qi_via_hi_lo (v16qi a, v16qi b) +{ + return (v16qi){a[8], a[9], b[8], b[9], a[10], a[11], b[10], b[11], + a[12], a[13], b[12], b[13], a[14], a[15], b[14], b[15]}; +} + +/* +** qi_via_si_hi: +** vmrhf %v24,%v24,%v26 +** br %r14 +*/ +v16qi __attribute__((noinline,noipa)) +qi_via_si_hi (v16qi a, v16qi b) +{ + 
return (v16qi){a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3], + a[4], a[5], a[6], a[7], b[4], b[5], b[6], b[7]}; +} + +/* +** qi_via_si_lo: +** vmrlf %v24,%v24,%v26 +** br %r14 +*/ +v16qi __attribute__((noinline,noipa)) +qi_via_si_lo (v16qi a, v16qi b) +{ + return (v16qi){a[8], a[9], a[10], a[11], b[8], b[9], b[10], b[11], + a[12], a[13], a[14], a[15], b[12], b[13], b[14], b[15]}; +} + +/* +** qi_via_di_hi: +** vmrhg %v24,%v24,%v26 +** br %r14 +*/ +v16qi __attribute__((noinline,noipa)) +qi_via_di_hi (v16qi a, v16qi b) +{ + return (v16qi){a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], + b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]}; +} + +/* +** qi_via_di_lo: +** vmrlg %v24,%v24,%v26 +** br %r14 +*/ +v16qi __attribute__((noinline,noipa)) +qi_via_di_lo (v16qi a, v16qi b) +{ + return (v16qi){a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15], + b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]}; +} + +/* +** hi_via_si_hi: +** vmrhf %v24,%v24,%v26 +** br %r14 +*/ +v8hi __attribute__((noinline,noipa)) +hi_via_si_hi (v8hi a, v8hi b) +{ + return (v8hi){a[0], a[1], b[0], b[1], a[2], a[3], b[2], b[3]}; +} + +/* +** hi_via_si_lo: +** vmrlf %v24,%v24,%v26 +** br %r14 +*/ +v8hi __attribute__((noinline,noipa)) +hi_via_si_lo (v8hi a, v8hi b) +{ + return (v8hi){a[4], a[5], b[4], b[5], a[6], a[7], b[6], b[7]}; +} + +/* +** hi_via_di_hi: +** vmrhg %v24,%v24,%v26 +** br %r14 +*/ +v8hi __attribute__((noinline,noipa)) +hi_via_di_hi (v8hi a, v8hi b) +{ + return (v8hi){a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]}; +} + +/* +** hi_via_di_lo: +** vmrlg %v24,%v24,%v26 +** br %r14 +*/ +v8hi __attribute__((noinline,noipa)) +hi_via_di_lo (v8hi a, v8hi b) +{ + return (v8hi){a[4], a[5], a[6], a[7], b[4], b[5], b[6], b[7]}; +} + +/* +** si_via_di_hi: +** vmrhg %v24,%v24,%v26 +** br %r14 +*/ +v4si __attribute__((noinline,noipa)) +si_via_di_hi (v4si a, v4si b) +{ + return (v4si){a[0], a[1], b[0], b[1]}; +} + +/* +** si_via_di_lo: +** vmrlg %v24,%v24,%v26 +** br %r14 +*/ +v4si 
__attribute__((noinline,noipa)) +si_via_di_lo (v4si a, v4si b) +{ + return (v4si){a[2], a[3], b[2], b[3]}; +} + +int +main () +{ + static const signed char e_qi_via_hi_hi[16] + = {0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23}; + static const signed char e_qi_via_hi_lo[16] + = {8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31}; + static const signed char e_qi_via_si_hi[16] + = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; + static const signed char e_qi_via_si_lo[16] + = {8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}; + static const signed char e_qi_via_di_hi[16] + = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; + static const signed char e_qi_via_di_lo[16] + = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; + + static const short e_hi_via_si_hi[8] = {0, 1, 8, 9, 2, 3, 10, 11}; + static const short e_hi_via_si_lo[8] = {4, 5, 12, 13, 6, 7, 14, 15}; + static const short e_hi_via_di_hi[8] = {0, 1, 2, 3, 8, 9, 10, 11}; + static const short e_hi_via_di_lo[8] = {4, 5, 6, 7, 12, 13, 14, 15}; + + static const int e_si_via_di_hi[4] = {0, 1, 4, 5}; + static const int e_si_via_di_lo[4] = {2, 3, 6, 7}; + + v16qi a_qi = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + v16qi b_qi = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + v8hi a_hi = {0, 1, 2, 3, 4, 5, 6, 7}; + v8hi b_hi = {8, 9, 10, 11, 12, 13, 14, 15}; + v4si a_si = {0, 1, 2, 3}; + v4si b_si = {4, 5, 6, 7}; + v16qi r_qi; + v8hi r_hi; + v4si r_si; + int i; + + r_qi = qi_via_hi_hi (a_qi, b_qi); + for (i = 0; i < 16; ++i) + if (r_qi[i] != e_qi_via_hi_hi[i]) + __builtin_abort (); + + r_qi = qi_via_hi_lo (a_qi, b_qi); + for (i = 0; i < 16; ++i) + if (r_qi[i] != e_qi_via_hi_lo[i]) + __builtin_abort (); + + r_qi = qi_via_si_hi (a_qi, b_qi); + for (i = 0; i < 16; ++i) + if (r_qi[i] != e_qi_via_si_hi[i]) + __builtin_abort (); + + r_qi = qi_via_si_lo (a_qi, b_qi); + for (i = 0; i < 16; ++i) + if (r_qi[i] != e_qi_via_si_lo[i]) 
+ __builtin_abort (); + + r_qi = qi_via_di_hi (a_qi, b_qi); + for (i = 0; i < 16; ++i) + if (r_qi[i] != e_qi_via_di_hi[i]) + __builtin_abort (); + + r_qi = qi_via_di_lo (a_qi, b_qi); + for (i = 0; i < 16; ++i) + if (r_qi[i] != e_qi_via_di_lo[i]) + __builtin_abort (); + + r_hi = hi_via_si_hi (a_hi, b_hi); + for (i = 0; i < 8; ++i) + if (r_hi[i] != e_hi_via_si_hi[i]) + __builtin_abort (); + + r_hi = hi_via_si_lo (a_hi, b_hi); + for (i = 0; i < 8; ++i) + if (r_hi[i] != e_hi_via_si_lo[i]) + __builtin_abort (); + + r_hi = hi_via_di_hi (a_hi, b_hi); + for (i = 0; i < 8; ++i) + if (r_hi[i] != e_hi_via_di_hi[i]) + __builtin_abort (); + + r_hi = hi_via_di_lo (a_hi, b_hi); + for (i = 0; i < 8; ++i) + if (r_hi[i] != e_hi_via_di_lo[i]) + __builtin_abort (); + + r_si = si_via_di_hi (a_si, b_si); + for (i = 0; i < 4; ++i) + if (r_si[i] != e_si_via_di_hi[i]) + __builtin_abort (); + + r_si = si_via_di_lo (a_si, b_si); + for (i = 0; i < 4; ++i) + if (r_si[i] != e_si_via_di_lo[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-perm-pack-1.c b/gcc/testsuite/gcc.target/s390/vector/vec-perm-pack-1.c new file mode 100644 index 000000000000..74aedfce6c88 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-perm-pack-1.c @@ -0,0 +1,133 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */ +/* { dg-do run { target { s390_z14_hw } } } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "vec-types.h" + +/* +** qi_via_hi: +** vpkh %v24,%v24,%v26 +** br %r14 +*/ +v16qi __attribute__((noinline,noipa)) +qi_via_hi (v16qi a, v16qi b) +{ + return (v16qi){a[1], a[3], a[5], a[7], a[9], a[11], a[13], a[15], + b[1], b[3], b[5], b[7], b[9], b[11], b[13], b[15]}; +} + +/* +** qi_via_si: +** vpkf %v24,%v24,%v26 +** br %r14 +*/ +v16qi __attribute__((noinline,noipa)) +qi_via_si (v16qi a, v16qi b) +{ + return (v16qi){a[2], a[3], a[6], a[7], a[10], a[11], a[14], a[15], + b[2], b[3], b[6], b[7], 
b[10], b[11], b[14], b[15]}; +} + +/* +** qi_via_di: +** vpkg %v24,%v24,%v26 +** br %r14 +*/ +v16qi __attribute__((noinline,noipa)) +qi_via_di (v16qi a, v16qi b) +{ + return (v16qi){a[4], a[5], a[6], a[7], a[12], a[13], a[14], a[15], + b[4], b[5], b[6], b[7], b[12], b[13], b[14], b[15]}; +} + +/* +** hi_via_si: +** vpkf %v24,%v24,%v26 +** br %r14 +*/ +v8hi __attribute__((noinline,noipa)) +hi_via_si (v8hi a, v8hi b) +{ + return (v8hi){a[1], a[3], a[5], a[7], b[1], b[3], b[5], b[7]}; +} + +/* +** hi_via_di: +** vpkg %v24,%v24,%v26 +** br %r14 +*/ +v8hi __attribute__((noinline,noipa)) +hi_via_di (v8hi a, v8hi b) +{ + return (v8hi){a[2], a[3], a[6], a[7], b[2], b[3], b[6], b[7]}; +} + +/* +** si_via_di: +** vpkg %v24,%v24,%v26 +** br %r14 +*/ +v4si __attribute__((noinline,noipa)) +si_via_di (v4si a, v4si b) +{ + return (v4si){a[1], a[3], b[1], b[3]}; +} + +int +main () +{ + static const signed char e_qi_via_hi[16] + = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; + static const signed char e_qi_via_si[16] + = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31}; + static const signed char e_qi_via_di[16] + = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + + static const short e_hi_via_si[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + static const short e_hi_via_di[8] = {2, 3, 6, 7, 10, 11, 14, 15}; + + static const int e_si_via_di[4] = {1, 3, 5, 7}; + + v16qi a_qi = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + v16qi b_qi = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + v8hi a_hi = {0, 1, 2, 3, 4, 5, 6, 7}; + v8hi b_hi = {8, 9, 10, 11, 12, 13, 14, 15}; + v4si a_si = {0, 1, 2, 3}; + v4si b_si = {4, 5, 6, 7}; + v16qi r_qi; + v8hi r_hi; + v4si r_si; + int i; + + r_qi = qi_via_hi (a_qi, b_qi); + for (i = 0; i < 16; ++i) + if (r_qi[i] != e_qi_via_hi[i]) + __builtin_abort (); + + r_qi = qi_via_si (a_qi, b_qi); + for (i = 0; i < 16; ++i) + if (r_qi[i] != e_qi_via_si[i]) + __builtin_abort (); + + r_qi = qi_via_di 
(a_qi, b_qi); + for (i = 0; i < 16; ++i) + if (r_qi[i] != e_qi_via_di[i]) + __builtin_abort (); + + r_hi = hi_via_si (a_hi, b_hi); + for (i = 0; i < 8; ++i) + if (r_hi[i] != e_hi_via_si[i]) + __builtin_abort (); + + r_hi = hi_via_di (a_hi, b_hi); + for (i = 0; i < 8; ++i) + if (r_hi[i] != e_hi_via_di[i]) + __builtin_abort (); + + r_si = si_via_di (a_si, b_si); + for (i = 0; i < 4; ++i) + if (r_si[i] != e_si_via_di[i]) + __builtin_abort (); + return 0; +} -- 2.43.5