Hi! This patch adds vec_interleave_{high,low}<mode> expanders for the VI_256 modes and uses them in the vector permutation expander. Each expander needs 3 insns, but the first two are the same in both patterns and will therefore be CSEd when both patterns are expanded (the usual case from the vectorizer, e.g. for vect-strided-store-u32-i2.c), so we end up with 2 vunpck* insns followed by 2 vperm2i128 insns.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2011-10-14 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (vec_interleave_high<mode>, vec_interleave_low<mode>): Add AVX2 expanders for VI_256 modes. * config/i386/i386.c (expand_vec_perm_interleave3): New function. (ix86_expand_vec_perm_builtin_1): Call it. --- gcc/config/i386/sse.md.jj 2011-10-13 14:50:15.000000000 +0200 +++ gcc/config/i386/sse.md 2011-10-13 17:34:26.000000000 +0200 @@ -6765,6 +6765,38 @@ (define_insn "vec_interleave_lowv4si" (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_expand "vec_interleave_high<mode>" + [(match_operand:VI_256 0 "register_operand" "=x") + (match_operand:VI_256 1 "register_operand" "x") + (match_operand:VI_256 2 "nonimmediate_operand" "xm")] + "TARGET_AVX2" +{ + rtx t1 = gen_reg_rtx (<MODE>mode); + rtx t2 = gen_reg_rtx (<MODE>mode); + emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2])); + emit_insn (gen_avx2_interleave_high<mode> (t2, operands[1], operands[2])); + emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, operands[0]), + gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, t2), GEN_INT (1 + (3 << 4)))); + DONE; +}) + +(define_expand "vec_interleave_low<mode>" + [(match_operand:VI_256 0 "register_operand" "=x") + (match_operand:VI_256 1 "register_operand" "x") + (match_operand:VI_256 2 "nonimmediate_operand" "xm")] + "TARGET_AVX2" +{ + rtx t1 = gen_reg_rtx (<MODE>mode); + rtx t2 = gen_reg_rtx (<MODE>mode); + emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2])); + emit_insn (gen_avx2_interleave_high<mode> (t2, operands[1], operands[2])); + emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, operands[0]), + gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, t2), GEN_INT (0 + (2 << 4)))); + DONE; +}) + ;; Modes handled by pinsr patterns. 
(define_mode_iterator PINSR_MODE [(V16QI "TARGET_SSE4_1") V8HI --- gcc/config/i386/i386.c.jj 2011-10-13 11:56:19.000000000 +0200 +++ gcc/config/i386/i386.c 2011-10-13 18:36:58.000000000 +0200 @@ -35474,6 +35474,82 @@ expand_vec_perm_interleave2 (struct expa return true; } +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + a two vector permutation using 2 intra-lane interleave insns + and cross-lane shuffle for 32-byte vectors. */ + +static bool +expand_vec_perm_interleave3 (struct expand_vec_perm_d *d) +{ + unsigned i, nelt; + rtx (*gen) (rtx, rtx, rtx); + + if (d->op0 == d->op1) + return false; + if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32) + ; + else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode)) + ; + else + return false; + + nelt = d->nelt; + if (d->perm[0] != 0 && d->perm[0] != nelt / 2) + return false; + for (i = 0; i < nelt; i += 2) + if (d->perm[i] != d->perm[0] + i / 2 + || d->perm[i + 1] != d->perm[0] + i / 2 + nelt) + return false; + + if (d->testing_p) + return true; + + switch (d->vmode) + { + case V32QImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv32qi; + else + gen = gen_vec_interleave_lowv32qi; + break; + case V16HImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv16hi; + else + gen = gen_vec_interleave_lowv16hi; + break; + case V8SImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv8si; + else + gen = gen_vec_interleave_lowv8si; + break; + case V4DImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv4di; + else + gen = gen_vec_interleave_lowv4di; + break; + case V8SFmode: + if (d->perm[0]) + gen = gen_vec_interleave_highv8sf; + else + gen = gen_vec_interleave_lowv8sf; + break; + case V4DFmode: + if (d->perm[0]) + gen = gen_vec_interleave_highv4df; + else + gen = gen_vec_interleave_lowv4df; + break; + default: + gcc_unreachable (); + } + + emit_insn (gen (d->target, d->op0, d->op1)); + return true; +} + /* A subroutine of expand_vec_perm_even_odd_1. 
Implement the double-word permutation with two pshufb insns and an ior. We should have already failed all two instruction sequences. */ @@ -35972,6 +36048,9 @@ ix86_expand_vec_perm_builtin_1 (struct e if (expand_vec_perm_pshufb2 (d)) return true; + if (expand_vec_perm_interleave3 (d)) + return true; + /* Try sequences of four instructions. */ if (expand_vec_perm_vpshufb2_vpermq (d)) Jakub