After the PR92818 commit, one function in the testcase below still does not use bit-inserts. The following patch fixes this.
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. Richard. 2019-12-05 Richard Biener <rguent...@suse.de> PR tree-optimization/92819 * match.pd (VEC_PERM_EXPR -> BIT_INSERT_EXPR): Handle inserts into the last lane. For two-element vectors try inserting into the last lane when inserting into the first fails. * gcc.target/i386/pr92819-1.c: New testcase. Index: gcc/testsuite/gcc.target/i386/pr92819-1.c =================================================================== --- gcc/testsuite/gcc.target/i386/pr92819-1.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/pr92819-1.c (working copy) @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msse2 -fdump-tree-forwprop1" } */ + +typedef double v2df __attribute__((vector_size (16))); + +v2df +foo (v2df x, double *p) +{ + return (v2df) { x[0], *p }; +} + +v2df +bar (v2df x, double *p) +{ + return (v2df) { *p, x[1] }; +} + +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 2 "forwprop1" } } */ +/* { dg-final { scan-assembler "movhpd" } } */ +/* { dg-final { scan-assembler "movlpd" } } */ Index: gcc/match.pd =================================================================== --- gcc/match.pd (revision 278998) +++ gcc/match.pd (working copy) @@ -6032,7 +6032,8 @@ (define_operator_list COND_TERNARY || TREE_CODE (cop1) == VECTOR_CST || TREE_CODE (cop1) == CONSTRUCTOR)) { - if (sel.series_p (1, 1, nelts + 1, 1)) + bool insert_first_p = sel.series_p (1, 1, nelts + 1, 1); + if (insert_first_p) { /* After canonicalizing the first elt to come from the first vector we only can insert the first elt from @@ -6041,13 +6042,19 @@ (define_operator_list COND_TERNARY if ((ins = fold_read_from_vector (cop0, sel[0]))) op0 = op1; } - else + /* The above can fail for two-element vectors which always + appear to insert the first element, so try inserting + into the second lane as well. For more than two + elements that's wasted time. 
*/ + if (!insert_first_p || (!ins && maybe_eq (nelts, 2u))) { unsigned int encoded_nelts = sel.encoding ().encoded_nelts (); for (at = 0; at < encoded_nelts; ++at) if (maybe_ne (sel[at], at)) break; - if (at < encoded_nelts && sel.series_p (at + 1, 1, at + 1, 1)) + if (at < encoded_nelts + && (known_eq (at + 1, nelts) + || sel.series_p (at + 1, 1, at + 1, 1))) { if (known_lt (poly_uint64 (sel[at]), nelts)) ins = fold_read_from_vector (cop0, sel[at]);