Hi!

Now that there is a better testsuite for constant reshuffling, this patch fixes various issues I found and improves the code generated for various permutations.  Bootstrapped/regtested on x86_64-linux and i686-linux, additionally tested with
GCC_TEST_RUN_EXPENSIVE=1 make check-gcc RUNTESTFLAGS='--target_board=unix\{-msse2,-msse4,-mavx\} dg-torture.exp=vshuf*'
on an AVX-capable box, and the -mavx2 compiled tests were run under the Intel SDE.  Ok for trunk?
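The improvements below come from constant __builtin_shuffle testcases along these lines (a minimal sketch, not part of the patch; the names and masks are illustrative, in the spirit of the vshuf* tests, compiled with -O2 -mavx2):

typedef unsigned short V __attribute__ ((vector_size (32)));

V a, b, c;

void
identity (void)
{
  V mask = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  c = __builtin_shuffle (a, mask);	/* should now be a plain load/store  */
}

void
broadcast (void)
{
  V mask = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  c = __builtin_shuffle (a, mask);	/* should now use vpbroadcastw  */
}

void
interleave_lo (void)
{
  V mask = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
  c = __builtin_shuffle (a, b, mask);	/* two-operand interleave-style case  */
}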
Examples of improvements, say for V16HImode:

-	vpshuflw	$228, a(%rip), %ymm0
+	vmovdqa	a(%rip), %ymm0
 	vmovdqa	%ymm0, c(%rip)

for an identity permutation, an ICE before vs.

+	vpbroadcastw	a(%rip), %ymm0
+	vmovdqa	%ymm0, c(%rip)

using vpbroadcast* for a broadcast shuffle,

-	vpshufb	.LC0(%rip), %ymm0, %ymm1
-	vpshufb	.LC1(%rip), %ymm0, %ymm0
-	vpermq	$78, %ymm1, %ymm1
-	vpor	%ymm1, %ymm0, %ymm0
+	vperm2i128	$0, %ymm0, %ymm0, %ymm0
+	vpshufb	.LC0(%rip), %ymm0, %ymm0

when both destination lanes refer to just one source lane, more than 20 insns
(the full two-argument non-constant shuffle fallback) turned into

+	vmovdqa	a(%rip), %ymm0
+	vpunpcklwd	b(%rip), %ymm0, %ymm0
+	vpshufb	.LC2(%rip), %ymm0, %ymm0
+	vmovdqa	%ymm0, c(%rip)

(resp. vpunpckhwd) when an interleave gives something vpshufb can reshuffle
afterwards, and

-	vmovdqa	a(%rip), %ymm0
-	vpshufb	.LC11(%rip), %ymm0, %ymm1
-	vpshufb	.LC12(%rip), %ymm0, %ymm0
-	vpermq	$78, %ymm1, %ymm1
-	vpor	%ymm1, %ymm0, %ymm0
+	vpermq	$156, a(%rip), %ymm0
+	vpshufb	.LC4(%rip), %ymm0, %ymm0

another case where vpermq can shuffle the quadwords into something vpshufb can
reshuffle afterwards, etc.

2011-10-18  Jakub Jelinek  <ja...@redhat.com>

	* config/i386/i386.c (ix86_expand_vec_perm): In merge_two use mode
	SUBREG of operands[0] as target.
	(valid_perm_using_mode_p): Don't ignore higher bits of d->perm.
	(expand_vec_perm_pshufb): For V8SImode vmode emit avx2_permvarv8si.
	(expand_vec_perm_1): Handle identity and some broadcast
	permutations.
	(expand_vec_perm_interleave2): Handle also 32-byte modes, using
	vperm2[fi]128 or vpunpck[lh]* followed by a single insn
	permutation.  For d->testing_p return true earlier to avoid
	creating more GC garbage.
	(expand_vec_perm_vpermq_perm_1): New function.
	(expand_vec_perm_vpshufb2_vpermq): For d->testing_p return true
	earlier to avoid creating more GC garbage.  Fix handling of
	V16HImode.  Avoid some SUBREGs in SET_DEST.
	(expand_vec_perm_broadcast_1): Return false for 32-byte integer
	vector modes.
	(expand_vec_perm_vpshufb4_vpermq2): New function.
	(ix86_expand_vec_perm_builtin_1): Call expand_vec_perm_vpermq_perm_1
	and expand_vec_perm_vpshufb4_vpermq2.
--- gcc/config/i386/i386.c.jj	2011-10-17 22:27:39.000000000 +0200
+++ gcc/config/i386/i386.c	2011-10-18 14:08:58.000000000 +0200
@@ -19663,7 +19663,7 @@ ix86_expand_vec_perm (rtx operands[])
 	  mask = expand_simple_binop (maskmode, AND, mask, vt,
 				      NULL_RTX, 0, OPTAB_DIRECT);
 
-	  xops[0] = operands[0];
+	  xops[0] = gen_lowpart (mode, operands[0]);
 	  xops[1] = gen_lowpart (mode, t2);
 	  xops[2] = gen_lowpart (mode, t1);
 	  xops[3] = gen_rtx_EQ (maskmode, mask, vt);
@@ -35006,8 +35006,7 @@ valid_perm_using_mode_p (enum machine_mo
     return false;
   else
     for (j = 1; j < chunk; ++j)
-      if ((d->perm[i] & (d->nelt - 1)) + j
-	  != (d->perm[i + j] & (d->nelt - 1)))
+      if (d->perm[i] + j != d->perm[i + j])
	return false;
 
   return true;
@@ -35138,6 +35137,8 @@ expand_vec_perm_pshufb (struct expand_ve
	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
       else if (vmode == V32QImode)
	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+      else
+	emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
     }
   else
     {
@@ -35163,9 +35164,58 @@ expand_vec_perm_1 (struct expand_vec_per
   if (d->op0 == d->op1)
     {
       int mask = nelt - 1;
+      bool identity_perm = true;
+      bool broadcast_perm = true;
 
       for (i = 0; i < nelt; i++)
-	perm2[i] = d->perm[i] & mask;
+	{
+	  perm2[i] = d->perm[i] & mask;
+	  if (perm2[i] != i)
+	    identity_perm = false;
+	  if (perm2[i])
+	    broadcast_perm = false;
+	}
+
+      if (identity_perm)
+	{
+	  if (!d->testing_p)
+	    emit_move_insn (d->target, d->op0);
+	  return true;
+	}
+      else if (broadcast_perm && TARGET_AVX2)
+	{
+	  /* Use vpbroadcast{b,w,d}.  */
+	  rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
+	  switch (d->vmode)
+	    {
+	    case V32QImode:
+	      op = gen_lowpart (V16QImode, op);
+	      gen = gen_avx2_pbroadcastv32qi;
+	      break;
+	    case V16HImode:
+	      op = gen_lowpart (V8HImode, op);
+	      gen = gen_avx2_pbroadcastv16hi;
+	      break;
+	    case V8SImode:
+	      op = gen_lowpart (V4SImode, op);
+	      gen = gen_avx2_pbroadcastv8si;
+	      break;
+	    case V16QImode:
+	      gen = gen_avx2_pbroadcastv16qi;
+	      break;
+	    case V8HImode:
+	      gen = gen_avx2_pbroadcastv8hi;
+	      break;
+	    /* For other modes prefer other shuffles this function creates.  */
+	    default: break;
+	    }
+	  if (gen != NULL)
+	    {
+	      if (!d->testing_p)
+		emit_insn (gen (d->target, op));
+	      return true;
+	    }
+	}
 
       if (expand_vselect (d->target, d->op0, perm2, nelt))
	return true;
@@ -35349,93 +35399,210 @@ expand_vec_perm_interleave2 (struct expa
 {
   struct expand_vec_perm_d dremap, dfinal;
   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
-  unsigned contents, h1, h2, h3, h4;
+  unsigned HOST_WIDE_INT contents;
   unsigned char remap[2 * MAX_VECT_LEN];
   rtx seq;
-  bool ok;
-
-  if (d->op0 == d->op1)
-    return false;
+  bool ok, same_halves = false;
 
-  /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit
-     lanes.  We can use similar techniques with the vperm2f128 instruction,
-     but it requires slightly different logic.  */
-  if (GET_MODE_SIZE (d->vmode) != 16)
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (d->op0 == d->op1)
+	return false;
+    }
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX)
+	return false;
+      /* For 32-byte modes allow even d->op0 == d->op1.
+	 The lack of cross-lane shuffling in some instructions
+	 might prevent a single insn shuffle.  */
+    }
+  else
     return false;
 
   /* Examine from whence the elements come.  */
   contents = 0;
   for (i = 0; i < nelt; ++i)
-    contents |= 1u << d->perm[i];
-
-  /* Split the two input vectors into 4 halves.  */
-  h1 = (1u << nelt2) - 1;
-  h2 = h1 << nelt2;
-  h3 = h2 << nelt2;
-  h4 = h3 << nelt2;
+    contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
 
   memset (remap, 0xff, sizeof (remap));
   dremap = *d;
 
-  /* If the elements from the low halves use interleave low, and similarly
-     for interleave high.  If the elements are from mis-matched halves, we
-     can use shufps for V4SF/V4SI or do a DImode shuffle.  */
-  if ((contents & (h1 | h3)) == contents)
+  if (GET_MODE_SIZE (d->vmode) == 16)
     {
-      for (i = 0; i < nelt2; ++i)
+      unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+      /* Split the two input vectors into 4 halves.  */
+      h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
+      h2 = h1 << nelt2;
+      h3 = h2 << nelt2;
+      h4 = h3 << nelt2;
+
+      /* If the elements from the low halves use interleave low, and similarly
+	 for interleave high.  If the elements are from mis-matched halves, we
+	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
+      if ((contents & (h1 | h3)) == contents)
	{
-	  remap[i] = i * 2;
-	  remap[i + nelt] = i * 2 + 1;
-	  dremap.perm[i * 2] = i;
-	  dremap.perm[i * 2 + 1] = i + nelt;
+	  /* punpckl* */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i] = i * 2;
+	      remap[i + nelt] = i * 2 + 1;
+	      dremap.perm[i * 2] = i;
+	      dremap.perm[i * 2 + 1] = i + nelt;
+	    }
	}
-    }
-  else if ((contents & (h2 | h4)) == contents)
-    {
-      for (i = 0; i < nelt2; ++i)
+      else if ((contents & (h2 | h4)) == contents)
	{
-	  remap[i + nelt2] = i * 2;
-	  remap[i + nelt + nelt2] = i * 2 + 1;
-	  dremap.perm[i * 2] = i + nelt2;
-	  dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+	  /* punpckh* */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i + nelt2] = i * 2;
+	      remap[i + nelt + nelt2] = i * 2 + 1;
+	      dremap.perm[i * 2] = i + nelt2;
+	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+	    }
	}
-    }
-  else if ((contents & (h1 | h4)) == contents)
-    {
-      for (i = 0; i < nelt2; ++i)
+      else if ((contents & (h1 | h4)) == contents)
	{
-	  remap[i] = i;
-	  remap[i + nelt + nelt2] = i + nelt2;
-	  dremap.perm[i] = i;
-	  dremap.perm[i + nelt2] = i + nelt + nelt2;
+	  /* shufps */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i] = i;
+	      remap[i + nelt + nelt2] = i + nelt2;
+	      dremap.perm[i] = i;
+	      dremap.perm[i + nelt2] = i + nelt + nelt2;
+	    }
+	  if (nelt != 4)
+	    {
+	      /* shufpd */
+	      dremap.vmode = V2DImode;
+	      dremap.nelt = 2;
+	      dremap.perm[0] = 0;
+	      dremap.perm[1] = 3;
+	    }
	}
-      if (nelt != 4)
+      else if ((contents & (h2 | h3)) == contents)
	{
-	  dremap.vmode = V2DImode;
-	  dremap.nelt = 2;
-	  dremap.perm[0] = 0;
-	  dremap.perm[1] = 3;
+	  /* shufps */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i + nelt2] = i;
+	      remap[i + nelt] = i + nelt2;
+	      dremap.perm[i] = i + nelt2;
+	      dremap.perm[i + nelt2] = i + nelt;
+	    }
+	  if (nelt != 4)
+	    {
+	      /* shufpd */
+	      dremap.vmode = V2DImode;
+	      dremap.nelt = 2;
+	      dremap.perm[0] = 1;
+	      dremap.perm[1] = 2;
+	    }
	}
+      else
+	return false;
     }
-  else if ((contents & (h2 | h3)) == contents)
+  else
     {
-      for (i = 0; i < nelt2; ++i)
+      unsigned int nelt4 = nelt / 4, nzcnt = 0;
+      unsigned HOST_WIDE_INT q[8];
+      unsigned int nonzero_halves[4];
+
+      /* Split the two input vectors into 8 quarters.  */
+      q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
+      for (i = 1; i < 8; ++i)
+	q[i] = q[0] << (nelt4 * i);
+      for (i = 0; i < 4; ++i)
+	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
+	  {
+	    nonzero_halves[nzcnt] = i;
+	    ++nzcnt;
+	  }
+
+      if (nzcnt == 1)
	{
-	  remap[i + nelt2] = i;
-	  remap[i + nelt] = i + nelt2;
-	  dremap.perm[i] = i + nelt2;
-	  dremap.perm[i + nelt2] = i + nelt;
+	  gcc_assert (d->op0 == d->op1);
+	  nonzero_halves[1] = nonzero_halves[0];
+	  same_halves = true;
	}
-      if (nelt != 4)
+      else if (d->op0 == d->op1)
	{
-	  dremap.vmode = V2DImode;
-	  dremap.nelt = 2;
-	  dremap.perm[0] = 1;
-	  dremap.perm[1] = 2;
+	  gcc_assert (nonzero_halves[0] == 0);
+	  gcc_assert (nonzero_halves[1] == 1);
	}
+
+      if (nzcnt <= 2)
+	{
+	  if (d->perm[0] / nelt2 == nonzero_halves[1])
+	    {
+	      /* Attempt to increase the likelihood that dfinal
+		 shuffle will be intra-lane.  */
+	      char tmph = nonzero_halves[0];
+	      nonzero_halves[0] = nonzero_halves[1];
+	      nonzero_halves[1] = tmph;
+	    }
+
+	  /* vperm2f128 or vperm2i128.  */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
+	      remap[i + nonzero_halves[0] * nelt2] = i;
+	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
+	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
+	    }
+
+	  if (d->vmode != V8SFmode
+	      && d->vmode != V4DFmode
+	      && d->vmode != V8SImode)
+	    {
+	      dremap.vmode = V8SImode;
+	      dremap.nelt = 8;
+	      for (i = 0; i < 4; ++i)
+		{
+		  dremap.perm[i] = i + nonzero_halves[0] * 4;
+		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
+		}
+	    }
+	}
+      else if (d->op0 == d->op1)
+	return false;
+      else if (TARGET_AVX2
+	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
+	{
+	  /* vpunpckl* */
+	  for (i = 0; i < nelt4; ++i)
+	    {
+	      remap[i] = i * 2;
+	      remap[i + nelt] = i * 2 + 1;
+	      remap[i + nelt2] = i * 2 + nelt2;
+	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
+	      dremap.perm[i * 2] = i;
+	      dremap.perm[i * 2 + 1] = i + nelt;
+	      dremap.perm[i * 2 + nelt2] = i + nelt2;
+	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
+	    }
+	}
+      else if (TARGET_AVX2
+	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
+	{
+	  /* vpunpckh* */
+	  for (i = 0; i < nelt4; ++i)
+	    {
+	      remap[i + nelt4] = i * 2;
+	      remap[i + nelt + nelt4] = i * 2 + 1;
+	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
+	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
+	      dremap.perm[i * 2] = i + nelt4;
+	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
+	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
+	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
+	    }
+	}
+      else
+	return false;
     }
-  else
-    return false;
 
   /* Use the remapping array set up above to move the elements from their
      swizzled locations into their final destinations.  */
@@ -35444,7 +35611,15 @@ expand_vec_perm_interleave2 (struct expa
     {
       unsigned e = remap[d->perm[i]];
       gcc_assert (e < nelt);
-      dfinal.perm[i] = e;
+      /* If same_halves is true, both halves of the remapped vector are the
+	 same.  Avoid cross-lane accesses if possible.  */
+      if (same_halves && i >= nelt2)
+	{
+	  gcc_assert (e < nelt2);
+	  dfinal.perm[i] = e + nelt2;
+	}
+      else
+	dfinal.perm[i] = e;
     }
   dfinal.op0 = gen_reg_rtx (dfinal.vmode);
   dfinal.op1 = dfinal.op0;
@@ -35460,6 +35635,9 @@ expand_vec_perm_interleave2 (struct expa
   if (!ok)
     return false;
 
+  if (d->testing_p)
+    return true;
+
   if (dremap.vmode != dfinal.vmode)
     {
       dremap.target = gen_lowpart (dremap.vmode, dremap.target);
@@ -35475,6 +35653,83 @@ expand_vec_perm_interleave2 (struct expa
 }
 
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   a single vector cross-lane permutation into vpermq followed
+   by any of the single insn permutations.  */
+
+static bool
+expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dremap, dfinal;
+  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
+  unsigned contents[2];
+  bool ok;
+
+  if (!(TARGET_AVX2
+	&& (d->vmode == V32QImode || d->vmode == V16HImode)
+	&& d->op0 == d->op1))
+    return false;
+
+  contents[0] = 0;
+  contents[1] = 0;
+  for (i = 0; i < nelt2; ++i)
+    {
+      contents[0] |= 1u << (d->perm[i] / nelt4);
+      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned int cnt = 0;
+      for (j = 0; j < 4; ++j)
+	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
+	  return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  dremap = *d;
+  dremap.vmode = V4DImode;
+  dremap.nelt = 4;
+  dremap.target = gen_reg_rtx (V4DImode);
+  dremap.op0 = gen_lowpart (V4DImode, d->op0);
+  dremap.op1 = dremap.op0;
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned int cnt = 0;
+      for (j = 0; j < 4; ++j)
+	if ((contents[i] & (1u << j)) != 0)
+	  dremap.perm[2 * i + cnt++] = j;
+      for (; cnt < 2; ++cnt)
+	dremap.perm[2 * i + cnt] = 0;
+    }
+
+  dfinal = *d;
+  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
+  dfinal.op1 = dfinal.op0;
+  for (i = 0, j = 0; i < nelt; ++i)
+    {
+      if (i == nelt2)
+	j = 2;
+      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
+      if ((d->perm[i] / nelt4) == dremap.perm[j])
+	;
+      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
+	dfinal.perm[i] |= nelt4;
+      else
+	gcc_unreachable ();
+    }
+
+  ok = expand_vec_perm_1 (&dremap);
+  gcc_assert (ok);
+
+  ok = expand_vec_perm_1 (&dfinal);
+  gcc_assert (ok);
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
    a two vector permutation using 2 intra-lane interleave insns
    and cross-lane shuffle for 32-byte vectors.  */
@@ -35621,6 +35876,9 @@ expand_vec_perm_vpshufb2_vpermq (struct
       || (d->vmode != V32QImode && d->vmode != V16HImode))
     return false;
 
+  if (d->testing_p)
+    return true;
+
   nelt = d->nelt;
   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
 
@@ -35635,12 +35893,12 @@ expand_vec_perm_vpshufb2_vpermq (struct
   for (i = 0; i < nelt; ++i)
     {
       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
-      unsigned which = ((d->perm[i] ^ i) & (nelt / 2));
+      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
 
       for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
-	  rperm[!which][(i * eltsz + j) ^ (which ^ (nelt / 2))] = m128;
+	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
     }
@@ -35652,10 +35910,9 @@ expand_vec_perm_vpshufb2_vpermq (struct
   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
 
   /* Swap the 128-byte lanes of h into hp.  */
-  hp = gen_reg_rtx (V32QImode);
+  hp = gen_reg_rtx (V4DImode);
   op = gen_lowpart (V4DImode, h);
-  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, hp), op,
-				  const2_rtx, GEN_INT (3), const0_rtx,
+  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));
 
   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
@@ -35666,7 +35923,7 @@ expand_vec_perm_vpshufb2_vpermq (struct
   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
 
   op = gen_lowpart (V32QImode, d->target);
-  emit_insn (gen_iorv32qi3 (op, l, hp));
+  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
 
   return true;
 }
@@ -35994,6 +36251,15 @@ expand_vec_perm_broadcast_1 (struct expa
       gcc_assert (ok);
       return true;
 
+    case V32QImode:
+    case V16HImode:
+    case V8SImode:
+    case V4DImode:
+      /* For AVX2 broadcasts of the first element vpbroadcast* or
+	 vpermq should be used by expand_vec_perm_1.  */
+      gcc_assert (!TARGET_AVX2 || d->perm[0]);
+      return false;
+
     default:
       gcc_unreachable ();
     }
@@ -36018,6 +36284,117 @@ expand_vec_perm_broadcast (struct expand
   return expand_vec_perm_broadcast_1 (d);
 }
 
+/* Implement arbitrary permutation of two V32QImode or V16HImode operands
+   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
+   all the shorter instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
+{
+  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
+  unsigned int i, nelt, eltsz;
+  bool used[4];
+
+  if (!TARGET_AVX2
+      || d->op0 == d->op1
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate 4 permutation masks.  If the required element is within
+     the same lane, it is shuffled in.  If the required element is from
+     the other lane, force a zero by setting bit 7 in the permutation mask.
+     In the other mask the mask has non-negative elements if the element
+     is requested from the other lane, but also moved to the other lane,
+     so that the result of vpshufb can have the two V2TImode halves
+     swapped.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < 32; ++i)
+    {
+      rperm[0][i] = m128;
+      rperm[1][i] = m128;
+      rperm[2][i] = m128;
+      rperm[3][i] = m128;
+    }
+  used[0] = false;
+  used[1] = false;
+  used[2] = false;
+  used[3] = false;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
+
+      for (j = 0; j < eltsz; ++j)
+	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
+      used[which] = true;
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (!used[2 * i + 1])
+	{
+	  h[i] = NULL_RTX;
+	  continue;
+	}
+      vperm = gen_rtx_CONST_VECTOR (V32QImode,
+				    gen_rtvec_v (32, rperm[2 * i + 1]));
+      vperm = force_reg (V32QImode, vperm);
+      h[i] = gen_reg_rtx (V32QImode);
+      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
+    }
+
+  /* Swap the 128-bit lanes of h[X].  */
+  for (i = 0; i < 2; ++i)
+    {
+      if (h[i] == NULL_RTX)
+	continue;
+      op = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
+				      const2_rtx, GEN_INT (3), const0_rtx,
+				      const1_rtx));
+      h[i] = gen_lowpart (V32QImode, op);
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (!used[2 * i])
+	{
+	  l[i] = NULL_RTX;
+	  continue;
+	}
+      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
+      vperm = force_reg (V32QImode, vperm);
+      l[i] = gen_reg_rtx (V32QImode);
+      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (h[i] && l[i])
+	{
+	  op = gen_reg_rtx (V32QImode);
+	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
+	  l[i] = op;
+	}
+      else if (h[i])
+	l[i] = h[i];
+    }
+
+  gcc_assert (l[0] && l[1]);
+  op = gen_lowpart (V32QImode, d->target);
+  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+  return true;
+}
+
 /* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
    With all of the interface bits taken care of, perform the expansion
    in D and return true on success.  */
@@ -36043,6 +36420,9 @@ ix86_expand_vec_perm_builtin_1 (struct e
   if (expand_vec_perm_broadcast (d))
     return true;
 
+  if (expand_vec_perm_vpermq_perm_1 (d))
+    return true;
+
   /* Try sequences of three instructions.  */
 
   if (expand_vec_perm_pshufb2 (d))
     return true;
@@ -36072,6 +36452,10 @@ ix86_expand_vec_perm_builtin_1 (struct e
   if (expand_vec_perm_even_odd (d))
     return true;
 
+  /* Even longer sequences.  */
+  if (expand_vec_perm_vpshufb4_vpermq2 (d))
+    return true;
+
   return false;
 }

	Jakub
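Illustrative only (not part of the patch or its testsuite): a constant two-operand V32QImode shuffle along the lines below pulls bytes from both operands and both 128-bit lanes, so the shorter strategies above should all fail and the shuffle should go through the new expand_vec_perm_vpshufb4_vpermq2 path (again -O2 -mavx2; the names and mask are made up):

typedef unsigned char V32 __attribute__ ((vector_size (32)));

V32 a, b, c;

void
two_op_shuffle (void)
{
  /* Mask elements 0-31 select from a, 32-63 from b.  */
  V32 mask = { 0, 33, 2, 35, 4, 37, 6, 39, 16, 49, 18, 51, 20, 53, 22, 55,
	       8, 41, 10, 43, 12, 45, 14, 47, 24, 57, 26, 59, 28, 61, 30, 63 };
  c = __builtin_shuffle (a, b, mask);
}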