Hi! On Tue, 21 May 2019 13:11:54 +0200 (CEST), Richard Biener <rguent...@suse.de> wrote: > On Mon, 20 May 2019, Richard Biener wrote: > > On Sun, 19 May 2019, Richard Sandiford wrote: > > > Richard Biener <rguent...@suse.de> writes: > > > > This adds, incrementally ontop of moving VEC_PERM_EXPR folding > > > > to match.pd (but not incremental patch - sorry...), folding > > > > of single-element insert permutations to BIT_INSERT_EXPR.
> And the following is what I applied after fixing all sign-compare > issues. > > Bootstraped and tested on x86_64-unknown-linux-gnu. I'm seeing one regression, in 'gcc/testsuite/brig/brig.sum': @@ -126,7 +126,7 @@ PASS: packed.hsail.brig scan-tree-dump original "\\+ { 15, 14, 13, 12, 11, 10, 9 PASS: packed.hsail.brig scan-tree-dump gimple "= { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15 };[\n ]+[a-z0-9_]+ = [a-z0-9_]+ \\+ [a-z0-9_]+;" PASS: packed.hsail.brig scan-tree-dump gimple "VEC_PERM_EXPR <[a-z0-9_]+, [a-z0-9_]+, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }>;" PASS: packed.hsail.brig scan-tree-dump gimple "_[0-9]+ = q2 \\+ q3;" [-PASS:-]{+FAIL:+} packed.hsail.brig scan-tree-dump gimple "= VEC_PERM_EXPR <[a-z0-9_]+, new_output.[0-9]+_[0-9]+, { 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }>;" PASS: packed.hsail.brig scan-tree-dump gimple "q4 = (VIEW_CONVERT_EXPR<uint128_t>\\()?s_output.[0-9]+(_[0-9]+)*\\)?;" PASS: packed.hsail.brig scan-tree-dump-times gimple "= __builtin___hsail_sat_add_u8" 64 PASS: packed.hsail.brig scan-tree-dump gimple "= VIEW_CONVERT_EXPR<vector\\(8\\) signed short>\\((s_output.[0-9]+_[0-9]+|q8)\\);[\n ]+q9 = -_[0-9]+;[\n ]+" The before vs. after tree dump changed as follows: --- packed.hsail.brig.005t.gimple 2019-05-22 14:29:48.810860260 +0200 +++ packed.hsail.brig.005t.gimple 2019-05-22 14:31:32.936670016 +0200 @@ -112,7 +112,7 @@ _26 = q2 + q3; new_output.11 = _26; new_output.21_27 = new_output.11; - _28 = VEC_PERM_EXPR <q4, new_output.21_27, { 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }>; + _28 = VEC_PERM_EXPR <new_output.21_27, q4, { 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }>; s_output.12 = _28; q4 = s_output.12; _29 = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; @@ -369,7 +369,7 @@ vec_out.22_273 = vec_out.16; new_output.17 = vec_out.22_273; new_output.23_274 = new_output.17; - _275 = VEC_PERM_EXPR <q8, new_output.23_274, { 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }>; + _275 = VEC_PERM_EXPR <new_output.23_274, q8, { 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }>; s_output.18 = _275; q8 = s_output.18; _276 = VIEW_CONVERT_EXPR<vector(8) signed short>(q8); Will it be OK to just commit the obvious patch to adjust the tree dump scanning, or is something actually wrong? (As you can tell, so far I didn't look up what that actually means -- hoping you can tell from the little bit of context?) Grüße Thomas > 2019-05-21 Richard Biener <rguent...@suse.de> > > PR middle-end/90510 > * fold-const.c (fold_read_from_vector): New function. > * fold-const.h (fold_read_from_vector): Declare. > * match.pd (VEC_PERM_EXPR): Build BIT_INSERT_EXPRs for > single-element insert permutations. Canonicalize selector > further and fix issue with last commit. > > * gcc.target/i386/pr90510.c: New testcase. > > Index: gcc/fold-const.c > =================================================================== > --- gcc/fold-const.c (revision 271412) > +++ gcc/fold-const.c (working copy) > @@ -13793,6 +13793,28 @@ fold_read_from_constant_string (tree exp > return NULL; > } > > +/* Folds a read from vector element at IDX of vector ARG. */ > + > +tree > +fold_read_from_vector (tree arg, poly_uint64 idx) > +{ > + unsigned HOST_WIDE_INT i; > + if (known_lt (idx, TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg))) > + && known_ge (idx, 0u) > + && idx.is_constant (&i)) > + { > + if (TREE_CODE (arg) == VECTOR_CST) > + return VECTOR_CST_ELT (arg, i); > + else if (TREE_CODE (arg) == CONSTRUCTOR) > + { > + if (i >= CONSTRUCTOR_NELTS (arg)) > + return build_zero_cst (TREE_TYPE (TREE_TYPE (arg))); > + return CONSTRUCTOR_ELT (arg, i)->value; > + } > + } > + return NULL_TREE; > +} > + > /* Return the tree for neg (ARG0) when ARG0 is known to be either > an integer constant, real, or fixed-point constant. > > Index: gcc/fold-const.h > =================================================================== > --- gcc/fold-const.h (revision 271412) > +++ gcc/fold-const.h (working copy) > @@ -100,6 +100,7 @@ extern tree fold_bit_and_mask (tree, tre > tree, enum tree_code, tree, tree, > tree, enum tree_code, tree, tree, tree *); > extern tree fold_read_from_constant_string (tree); > +extern tree fold_read_from_vector (tree, poly_uint64); > #if GCC_VEC_PERN_INDICES_H > extern tree fold_vec_perm (tree, tree, tree, const vec_perm_indices &); > #endif > Index: gcc/match.pd > =================================================================== > --- gcc/match.pd (revision 271412) > +++ gcc/match.pd (working copy) > @@ -5406,6 +5406,11 @@ (define_operator_list COND_TERNARY > op0 = op1; > sel.rotate_inputs (1); > } > + else if (known_ge (poly_uint64 (sel[0]), nelts)) > + { > + std::swap (op0, op1); > + sel.rotate_inputs (1); > + } > } > gassign *def; > tree cop0 = op0, cop1 = op1; > @@ -5429,9 +5434,46 @@ (define_operator_list COND_TERNARY > (with > { > bool changed = (op0 == op1 && !single_arg); > + tree ins = NULL_TREE; > + unsigned at = 0; > + > + /* See if the permutation is performing a single element > + insert from a CONSTRUCTOR or constant and use a BIT_INSERT_EXPR > + in that case. But only if the vector mode is supported, > + otherwise this is invalid GIMPLE. */ > + if (TYPE_MODE (type) != BLKmode > + && (TREE_CODE (cop0) == VECTOR_CST > + || TREE_CODE (cop0) == CONSTRUCTOR > + || TREE_CODE (cop1) == VECTOR_CST > + || TREE_CODE (cop1) == CONSTRUCTOR)) > + { > + if (sel.series_p (1, 1, nelts + 1, 1)) > + { > + /* After canonicalizing the first elt to come from the > + first vector we only can insert the first elt from > + the first vector. */ > + at = 0; > + ins = fold_read_from_vector (cop0, 0); > + op0 = op1; > + } > + else > + { > + unsigned int encoded_nelts = sel.encoding ().encoded_nelts (); > + for (at = 0; at < encoded_nelts; ++at) > + if (maybe_ne (sel[at], at)) > + break; > + if (at < encoded_nelts && sel.series_p (at + 1, 1, at + 1, 1)) > + { > + if (known_lt (at, nelts)) > + ins = fold_read_from_vector (cop0, sel[at]); > + else > + ins = fold_read_from_vector (cop1, sel[at] - nelts); > + } > + } > + } > > /* Generate a canonical form of the selector. */ > - if (sel.encoding () != builder) > + if (!ins && sel.encoding () != builder) > { > /* Some targets are deficient and fail to expand a single > argument permutation while still allowing an equivalent > @@ -5450,10 +5492,12 @@ (define_operator_list COND_TERNARY > so use the preferred form. */ > op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel); > } > - /* Differences in the encoder do not necessarily mean > - differences in the resulting vector. */ > - changed = !operand_equal_p (op2, oldop2, 0); > + if (!operand_equal_p (op2, oldop2, 0)) > + changed = true; > } > } > - (if (changed) > - (vec_perm { op0; } { op1; } { op2; }))))))))) > + (if (ins) > + (bit_insert { op0; } { ins; } > + { bitsize_int (at * tree_to_uhwi (TYPE_SIZE (TREE_TYPE (type)))); }) > + (if (changed) > + (vec_perm { op0; } { op1; } { op2; })))))))))) > Index: gcc/testsuite/gcc.target/i386/pr90510.c > =================================================================== > --- gcc/testsuite/gcc.target/i386/pr90510.c (revision 0) > +++ gcc/testsuite/gcc.target/i386/pr90510.c (working copy) > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */ > + > +typedef double __v2df __attribute__ ((__vector_size__ (16))); > +typedef long long __v2di __attribute__ ((__vector_size__ (16))); > + > +__v2df > +_mm_add_sd_A (__v2df x, __v2df y) > +{ > + double tem = x[0] + y[0]; > + return __builtin_shuffle ( x, (__v2df) { tem, tem }, (__v2di) { 2, 1 } ); > +} > + > +__v2df > +_mm_add_sd_B (__v2df x, __v2df y) > +{ > + __v2df z = { (x[0] + y[0]), x[1] }; > + return z; > +} > + > +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 2 "optimized" } } */ > +/* { dg-final { scan-assembler-not "unpck" } } */