Hi!

On Tue, 21 May 2019 13:11:54 +0200 (CEST), Richard Biener <rguent...@suse.de> 
wrote:
> On Mon, 20 May 2019, Richard Biener wrote:
> > On Sun, 19 May 2019, Richard Sandiford wrote:
> > > Richard Biener <rguent...@suse.de> writes:
> > > > This adds, incrementally on top of moving VEC_PERM_EXPR folding
> > > > to match.pd (but not as an incremental patch - sorry...), folding
> > > > of single-element insert permutations to BIT_INSERT_EXPR.

> And the following is what I applied after fixing all sign-compare
> issues.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu.

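For readers following along: as I understand it, the new folding turns a
VEC_PERM_EXPR that merely replaces a single lane of a vector with an
element taken from a VECTOR_CST or CONSTRUCTOR into a BIT_INSERT_EXPR.  A
minimal C sketch of the kind of source affected -- this just mirrors
_mm_add_sd_A from the new testcase quoted below, with a made-up function
name:

    typedef double v2df __attribute__ ((__vector_size__ (16)));
    typedef long long v2di __attribute__ ((__vector_size__ (16)));

    /* Only lane 0 of X is replaced, so the shuffle is really a
       single-element insert and should now fold to a BIT_INSERT_EXPR
       instead of staying a VEC_PERM_EXPR.  */
    v2df
    insert_lane0 (v2df x, double tem)
    {
      return __builtin_shuffle (x, (v2df) { tem, tem }, (v2di) { 2, 1 });
    }
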
I'm seeing one regression, in 'gcc/testsuite/brig/brig.sum':

    @@ -126,7 +126,7 @@ PASS: packed.hsail.brig scan-tree-dump original "\\+ { 
15, 14, 13, 12, 11, 10, 9
    PASS: packed.hsail.brig scan-tree-dump gimple "= { 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15 };[\n ]+[a-z0-9_]+ = [a-z0-9_]+ \\+ 
[a-z0-9_]+;"
    PASS: packed.hsail.brig scan-tree-dump gimple "VEC_PERM_EXPR <[a-z0-9_]+, 
[a-z0-9_]+, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }>;"
    PASS: packed.hsail.brig scan-tree-dump gimple "_[0-9]+ = q2 \\+ q3;"
    [-PASS:-]{+FAIL:+} packed.hsail.brig scan-tree-dump gimple "= VEC_PERM_EXPR 
<[a-z0-9_]+, new_output.[0-9]+_[0-9]+, { 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15 }>;"
    PASS: packed.hsail.brig scan-tree-dump gimple "q4 = 
(VIEW_CONVERT_EXPR<uint128_t>\\()?s_output.[0-9]+(_[0-9]+)*\\)?;"
    PASS: packed.hsail.brig scan-tree-dump-times gimple "= 
__builtin___hsail_sat_add_u8" 64
    PASS: packed.hsail.brig scan-tree-dump gimple "= 
VIEW_CONVERT_EXPR<vector\\(8\\) signed 
short>\\((s_output.[0-9]+_[0-9]+|q8)\\);[\n ]+q9 = -_[0-9]+;[\n ]+"

The before vs. after tree dump changed as follows:

    --- packed.hsail.brig.005t.gimple   2019-05-22 14:29:48.810860260 +0200
    +++ packed.hsail.brig.005t.gimple   2019-05-22 14:31:32.936670016 +0200
    @@ -112,7 +112,7 @@
       _26 = q2 + q3;
       new_output.11 = _26;
       new_output.21_27 = new_output.11;
    -  _28 = VEC_PERM_EXPR <q4, new_output.21_27, { 16, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15 }>;
    +  _28 = VEC_PERM_EXPR <new_output.21_27, q4, { 0, 17, 18, 19, 20, 21, 22, 
23, 24, 25, 26, 27, 28, 29, 30, 31 }>;
       s_output.12 = _28;
       q4 = s_output.12;
       _29 = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    @@ -369,7 +369,7 @@
       vec_out.22_273 = vec_out.16;
       new_output.17 = vec_out.22_273;
       new_output.23_274 = new_output.17;
    -  _275 = VEC_PERM_EXPR <q8, new_output.23_274, { 16, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15 }>;
    +  _275 = VEC_PERM_EXPR <new_output.23_274, q8, { 0, 17, 18, 19, 20, 21, 
22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }>;
       s_output.18 = _275;
       q8 = s_output.18;
       _276 = VIEW_CONVERT_EXPR<vector(8) signed short>(q8);
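
If I'm reading these correctly, the two forms describe the same shuffle:
the operands got swapped and the selector rebased by 16, so index 16 into
<q4, new_output> (element 0 of new_output) becomes index 0 into
<new_output, q4>.  A quick C sketch of that equivalence -- type and
function names made up, assuming 16 byte-sized lanes to match the
16-element selectors in the dump:

    typedef unsigned char v16qi __attribute__ ((__vector_size__ (16)));

    /* Old canonical form: lane 0 from NEW_OUTPUT, lanes 1-15 from Q4.  */
    v16qi
    old_form (v16qi q4, v16qi new_output)
    {
      return __builtin_shuffle (q4, new_output,
                                (v16qi) { 16, 1, 2, 3, 4, 5, 6, 7,
                                          8, 9, 10, 11, 12, 13, 14, 15 });
    }

    /* New canonical form after the patch: the same elements, with the two
       operands swapped and the selector indices adjusted accordingly.  */
    v16qi
    new_form (v16qi q4, v16qi new_output)
    {
      return __builtin_shuffle (new_output, q4,
                                (v16qi) { 0, 17, 18, 19, 20, 21, 22, 23,
                                          24, 25, 26, 27, 28, 29, 30, 31 });
    }

So presumably only the scan pattern is stale.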

Will it be OK to just commit the obvious patch to adjust the tree dump
scanning, or is something actually wrong?  (Beyond the quick reading above
I haven't dug into what the change actually means -- hoping you can tell
from the little bit of context?)


Regards
 Thomas


> 2019-05-21  Richard Biener  <rguent...@suse.de>
> 
>       PR middle-end/90510
>       * fold-const.c (fold_read_from_vector): New function.
>       * fold-const.h (fold_read_from_vector): Declare.
>       * match.pd (VEC_PERM_EXPR): Build BIT_INSERT_EXPRs for
>       single-element insert permutations.  Canonicalize selector
>       further and fix issue with last commit.
> 
>       * gcc.target/i386/pr90510.c: New testcase.
> 
> Index: gcc/fold-const.c
> ===================================================================
> --- gcc/fold-const.c  (revision 271412)
> +++ gcc/fold-const.c  (working copy)
> @@ -13793,6 +13793,28 @@ fold_read_from_constant_string (tree exp
>    return NULL;
>  }
>  
> +/* Folds a read from vector element at IDX of vector ARG.  */
> +
> +tree
> +fold_read_from_vector (tree arg, poly_uint64 idx)
> +{
> +  unsigned HOST_WIDE_INT i;
> +  if (known_lt (idx, TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg)))
> +      && known_ge (idx, 0u)
> +      && idx.is_constant (&i))
> +    {
> +      if (TREE_CODE (arg) == VECTOR_CST)
> +     return VECTOR_CST_ELT (arg, i);
> +      else if (TREE_CODE (arg) == CONSTRUCTOR)
> +     {
> +       if (i >= CONSTRUCTOR_NELTS (arg))
> +         return build_zero_cst (TREE_TYPE (TREE_TYPE (arg)));
> +       return CONSTRUCTOR_ELT (arg, i)->value;
> +     }
> +    }
> +  return NULL_TREE;
> +}
> +
>  /* Return the tree for neg (ARG0) when ARG0 is known to be either
>     an integer constant, real, or fixed-point constant.
>  
> Index: gcc/fold-const.h
> ===================================================================
> --- gcc/fold-const.h  (revision 271412)
> +++ gcc/fold-const.h  (working copy)
> @@ -100,6 +100,7 @@ extern tree fold_bit_and_mask (tree, tre
>                              tree, enum tree_code, tree, tree,
>                              tree, enum tree_code, tree, tree, tree *);
>  extern tree fold_read_from_constant_string (tree);
> +extern tree fold_read_from_vector (tree, poly_uint64);
>  #if GCC_VEC_PERN_INDICES_H
>  extern tree fold_vec_perm (tree, tree, tree, const vec_perm_indices &);
>  #endif
> Index: gcc/match.pd
> ===================================================================
> --- gcc/match.pd      (revision 271412)
> +++ gcc/match.pd      (working copy)
> @@ -5406,6 +5406,11 @@ (define_operator_list COND_TERNARY
>              op0 = op1;
>              sel.rotate_inputs (1);
>            }
> +        else if (known_ge (poly_uint64 (sel[0]), nelts))
> +          {
> +            std::swap (op0, op1);
> +            sel.rotate_inputs (1);
> +          }
>           }
>         gassign *def;
>         tree cop0 = op0, cop1 = op1;
> @@ -5429,9 +5434,46 @@ (define_operator_list COND_TERNARY
>       (with
>        {
>       bool changed = (op0 == op1 && !single_arg);
> +     tree ins = NULL_TREE;
> +     unsigned at = 0;
> +
> +     /* See if the permutation is performing a single element
> +        insert from a CONSTRUCTOR or constant and use a BIT_INSERT_EXPR
> +        in that case.  But only if the vector mode is supported,
> +        otherwise this is invalid GIMPLE.  */
> +        if (TYPE_MODE (type) != BLKmode
> +         && (TREE_CODE (cop0) == VECTOR_CST
> +             || TREE_CODE (cop0) == CONSTRUCTOR
> +             || TREE_CODE (cop1) == VECTOR_CST
> +             || TREE_CODE (cop1) == CONSTRUCTOR))
> +          {
> +         if (sel.series_p (1, 1, nelts + 1, 1))
> +           {
> +             /* After canonicalizing the first elt to come from the
> +                first vector we only can insert the first elt from
> +                the first vector.  */
> +             at = 0;
> +             ins = fold_read_from_vector (cop0, 0);
> +             op0 = op1;
> +           }
> +         else
> +           {
> +             unsigned int encoded_nelts = sel.encoding ().encoded_nelts ();
> +             for (at = 0; at < encoded_nelts; ++at)
> +               if (maybe_ne (sel[at], at))
> +                 break;
> +             if (at < encoded_nelts && sel.series_p (at + 1, 1, at + 1, 1))
> +               {
> +                 if (known_lt (at, nelts))
> +                   ins = fold_read_from_vector (cop0, sel[at]);
> +                 else
> +                   ins = fold_read_from_vector (cop1, sel[at] - nelts);
> +               }
> +           }
> +       }
>  
>       /* Generate a canonical form of the selector.  */
> -     if (sel.encoding () != builder)
> +     if (!ins && sel.encoding () != builder)
>         {
>           /* Some targets are deficient and fail to expand a single
>              argument permutation while still allowing an equivalent
> @@ -5450,10 +5492,12 @@ (define_operator_list COND_TERNARY
>                    so use the preferred form.  */
>                 op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel);
>             }
> -         /* Differences in the encoder do not necessarily mean
> -            differences in the resulting vector.  */
> -         changed = !operand_equal_p (op2, oldop2, 0);
> +         if (!operand_equal_p (op2, oldop2, 0))
> +           changed = true;
>         }
>        }
> -      (if (changed)
> -       (vec_perm { op0; } { op1; } { op2; })))))))))
> +      (if (ins)
> +       (bit_insert { op0; } { ins; }
> +         { bitsize_int (at * tree_to_uhwi (TYPE_SIZE (TREE_TYPE (type)))); })
> +       (if (changed)
> +        (vec_perm { op0; } { op1; } { op2; }))))))))))
> Index: gcc/testsuite/gcc.target/i386/pr90510.c
> ===================================================================
> --- gcc/testsuite/gcc.target/i386/pr90510.c   (revision 0)
> +++ gcc/testsuite/gcc.target/i386/pr90510.c   (working copy)
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
> +
> +typedef double __v2df __attribute__ ((__vector_size__ (16)));
> +typedef long long __v2di __attribute__ ((__vector_size__ (16)));
> +
> +__v2df
> +_mm_add_sd_A (__v2df x, __v2df y)
> +{
> +  double tem = x[0] + y[0];
> +  return __builtin_shuffle ( x, (__v2df) { tem, tem }, (__v2di) { 2, 1 } );
> +}
> +
> +__v2df
> +_mm_add_sd_B (__v2df x, __v2df y)
> +{
> +  __v2df z = { (x[0] + y[0]), x[1] };
> +  return z;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 2 "optimized" } } */
> +/* { dg-final { scan-assembler-not "unpck" } } */
