Deal with cases where vpdi and vmr{l,h} are still applicable if the operands of those instructions are swapped. For example, currently for

V2DI foo (V2DI x) { return (V2DI) {x[1], x[0]}; }

the assembler sequence

vlgvg   %r1,%v24,1
vzero   %v0
vlvgg   %v0,%r1,0
vmrhg   %v24,%v0,%v24

is emitted.  With this patch a single vpdi is emitted.

Extensive tests are included in a subsequent patch of this series where
more cases are covered.

Bootstrapped and regtested on s390.  Ok for mainline?

gcc/ChangeLog:

	* config/s390/s390.cc (expand_perm_with_merge): Deal with cases
	where vmr{l,h} are still applicable if the operands are swapped.
	(expand_perm_with_vpdi): Likewise for vpdi.
---
 gcc/config/s390/s390.cc | 118 ++++++++++++++++++++++++++++++----------
 1 file changed, 90 insertions(+), 28 deletions(-)

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 64f56d8effa..185eb59f8b8 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -17532,40 +17532,86 @@ struct expand_vec_perm_d
 static bool
 expand_perm_with_merge (const struct expand_vec_perm_d &d)
 {
-  bool merge_lo_p = true;
-  bool merge_hi_p = true;
-
-  if (d.nelt % 2)
+  static const unsigned char hi_perm_di[2] = {0, 2};
+  static const unsigned char hi_perm_si[4] = {0, 4, 1, 5};
+  static const unsigned char hi_perm_hi[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+  static const unsigned char hi_perm_qi[16]
+    = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
+
+  static const unsigned char hi_perm_di_swap[2] = {2, 0};
+  static const unsigned char hi_perm_si_swap[4] = {4, 0, 5, 1};
+  static const unsigned char hi_perm_hi_swap[8] = {8, 0, 9, 1, 10, 2, 11, 3};
+  static const unsigned char hi_perm_qi_swap[16]
+    = {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7};
+
+  static const unsigned char lo_perm_di[2] = {1, 3};
+  static const unsigned char lo_perm_si[4] = {2, 6, 3, 7};
+  static const unsigned char lo_perm_hi[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+  static const unsigned char lo_perm_qi[16]
+    = {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+
+  static const unsigned char lo_perm_di_swap[2] = {3, 1};
+  static const unsigned char lo_perm_si_swap[4] = {6, 2, 7, 3};
+  static const unsigned char lo_perm_hi_swap[8] = {12, 4, 13, 5, 14, 6, 15, 7};
+  static const unsigned char lo_perm_qi_swap[16]
+    = {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15};
+
+  bool merge_lo_p = false;
+  bool merge_hi_p = false;
+  bool swap_operands_p = false;
+
+  if ((d.nelt == 2 && memcmp (d.perm, hi_perm_di, 2) == 0)
+      || (d.nelt == 4 && memcmp (d.perm, hi_perm_si, 4) == 0)
+      || (d.nelt == 8 && memcmp (d.perm, hi_perm_hi, 8) == 0)
+      || (d.nelt == 16 && memcmp (d.perm, hi_perm_qi, 16) == 0))
+    {
+      merge_hi_p = true;
+    }
+  else if ((d.nelt == 2 && memcmp (d.perm, hi_perm_di_swap, 2) == 0)
+	   || (d.nelt == 4 && memcmp (d.perm, hi_perm_si_swap, 4) == 0)
+	   || (d.nelt == 8 && memcmp (d.perm, hi_perm_hi_swap, 8) == 0)
+	   || (d.nelt == 16 && memcmp (d.perm, hi_perm_qi_swap, 16) == 0))
+    {
+      merge_hi_p = true;
+      swap_operands_p = true;
+    }
+  else if ((d.nelt == 2 && memcmp (d.perm, lo_perm_di, 2) == 0)
+	   || (d.nelt == 4 && memcmp (d.perm, lo_perm_si, 4) == 0)
+	   || (d.nelt == 8 && memcmp (d.perm, lo_perm_hi, 8) == 0)
+	   || (d.nelt == 16 && memcmp (d.perm, lo_perm_qi, 16) == 0))
+    {
+      merge_lo_p = true;
+    }
+  else if ((d.nelt == 2 && memcmp (d.perm, lo_perm_di_swap, 2) == 0)
+	   || (d.nelt == 4 && memcmp (d.perm, lo_perm_si_swap, 4) == 0)
+	   || (d.nelt == 8 && memcmp (d.perm, lo_perm_hi_swap, 8) == 0)
+	   || (d.nelt == 16 && memcmp (d.perm, lo_perm_qi_swap, 16) == 0))
+    {
+      merge_lo_p = true;
+      swap_operands_p = true;
+    }
+
+  if (!merge_lo_p && !merge_hi_p)
     return false;
 
-  // For V4SI this checks for: { 0, 4, 1, 5 }
-  for (int telt = 0; telt < d.nelt; telt++)
-    if (d.perm[telt] != telt / 2 + (telt % 2) * d.nelt)
-      {
-	merge_hi_p = false;
-	break;
-      }
+  if (d.testing_p)
+    return merge_lo_p || merge_hi_p;
 
-  if (!merge_hi_p)
+  rtx op0, op1;
+  if (swap_operands_p)
     {
-      // For V4SI this checks for: { 2, 6, 3, 7 }
-      for (int telt = 0; telt < d.nelt; telt++)
-	if (d.perm[telt] != (telt + d.nelt) / 2 + (telt % 2) * d.nelt)
-	  {
-	    merge_lo_p = false;
-	    break;
-	  }
+      op0 = d.op1;
+      op1 = d.op0;
     }
   else
-    merge_lo_p = false;
-
-  if (d.testing_p)
-    return merge_lo_p || merge_hi_p;
+    {
+      op0 = d.op0;
+      op1 = d.op1;
+    }
 
-  if (merge_lo_p || merge_hi_p)
-    s390_expand_merge (d.target, d.op0, d.op1, merge_hi_p);
+  s390_expand_merge (d.target, op0, op1, merge_hi_p);
 
-  return merge_lo_p || merge_hi_p;
+  return true;
 }
 
 /* Try to expand the vector permute operation described by D using the
@@ -17582,6 +17628,7 @@ expand_perm_with_vpdi (const struct expand_vec_perm_d &d)
 {
   bool vpdi1_p = false;
   bool vpdi4_p = false;
+  bool swap_operands_p = false;
   rtx op0_reg, op1_reg;
 
   // Only V2DI and V2DF are supported here.
@@ -17590,11 +17637,20 @@ expand_perm_with_vpdi (const struct expand_vec_perm_d &d)
 
   if (d.perm[0] == 0 && d.perm[1] == 3)
     vpdi1_p = true;
-
-  if ((d.perm[0] == 1 && d.perm[1] == 2)
+  else if (d.perm[0] == 2 && d.perm[1] == 1)
+    {
+      vpdi1_p = true;
+      swap_operands_p = true;
+    }
+  else if ((d.perm[0] == 1 && d.perm[1] == 2)
       || (d.perm[0] == 1 && d.perm[1] == 0)
       || (d.perm[0] == 3 && d.perm[1] == 2))
     vpdi4_p = true;
+  else if (d.perm[0] == 3 && d.perm[1] == 0)
+    {
+      vpdi4_p = true;
+      swap_operands_p = true;
+    }
 
   if (!vpdi1_p && !vpdi4_p)
     return false;
@@ -17611,6 +17667,12 @@ expand_perm_with_vpdi (const struct expand_vec_perm_d &d)
     op1_reg = op0_reg;
   else if (d.only_op1)
     op0_reg = op1_reg;
+  else if (swap_operands_p)
+    {
+      rtx tmp = op0_reg;
+      op0_reg = op1_reg;
+      op1_reg = tmp;
+    }
 
   if (vpdi1_p)
     emit_insn (gen_vpdi1 (d.vmode, d.target, op0_reg, op1_reg));
-- 
2.41.0