> Am 19.12.2024 um 21:47 schrieb Christoph Müllner 
> <christoph.muell...@vrull.eu>:
> 
> In PR117830 a miscompilation of 464.h264ref was reported.
> An analysis showed that wrong code was generated because of
> unsatisfied assuptions in the code.  This patch addresses
> these issues.
> 
> The first assuption was that we can independently analyse the two
> vec-perms at the start of a vec-perm-simplify sequence and use the
> information  later for calculating a final vec-perm selector that
> utilizes less lanes.  However, this information does not help much,
> because for changing the selector entry, we need to ensure that both
> elements of the operand vectors v_1 and v_2 remain equal.
> This is addressed by removing the function get_vect_selector_index_map
> and checking for this equality in the loop where we create the new
> selector.
> 
> The calculation of the selector vector for the blended sequence
> assumed that the indices of the selector vector of the narrowed
> sequences are increasing.  This assuption does not hold in general.
> This was fixed by allowing a wrap-around when searching for an empty
> lane.
> 
> Further, there was an issue in the calculation of the selector vector
> entries for the second sequence.  The code did not consider that the
> lanes of the second sequence could have been moved.
> 
> A relevant property of this patch is, that it introduces a
> couple of nested loops, where the out loop iterates from
> i=0..nelts and the inner loop iterates from j=0..i.
> To avoid performance concerns, a check is introduced that
> ensures nelts won't exceed 4 lanes.
> 
> The added test case is derived from h264ref (the other cases from the
> benchmark have the same structure and don't provide additional coverage).
> 
> Bootstrapped and regression-tested on x86-64 and aarch64.
> Further, tested on CPU 2006 h264ref and CPU 2017 x264.

Ok

Thanks,
Richard 

>    PR117830
> 
> gcc/ChangeLog:
> 
>    * tree-ssa-forwprop.cc (get_vect_selector_index_map):
>    (recognise_vec_perm_simplify_seq):
>    (calc_perm_vec_perm_simplify_seqs):
>    (process_vec_perm_simplify_seq_list):
> 
> gcc/testsuite/ChangeLog:
> 
>    * gcc.dg/tree-ssa/vector-11.c: New test.
> 
> Signed-off-by: Christoph Müllner <christoph.muell...@vrull.eu>
> ---
> gcc/testsuite/gcc.dg/tree-ssa/vector-11.c |  38 ++++
> gcc/tree-ssa-forwprop.cc                  | 203 +++++++++++++---------
> 2 files changed, 162 insertions(+), 79 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/vector-11.c
> 
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c
> new file mode 100644
> index 00000000000..e4102d318d2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details -Wno-psabi" } 
> */
> +
> +typedef int vec __attribute__((vector_size (4 * sizeof (int))));
> +
> +void f1 (vec *p_v_in, vec *p_v_out_1, vec *p_v_out_2)
> +{
> +  vec sel00 = { 2, 3, 2, 2 };
> +  vec sel01 = { 1, 0, 1, 1 };
> +  vec sel10 = { 3, 2, 3, 3 };
> +  vec sel11 = { 0, 1, 0, 0 };
> +  vec sel = { 0, 5, 2, 7 };
> +  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
> +  vec v_in = *p_v_in;
> +
> +  /* First vec perm sequence.  */
> +  v_1 = __builtin_shuffle (v_in, v_in, sel00);
> +  v_2 = __builtin_shuffle (v_in, v_in, sel01);
> +  v_x = v_2 - v_1;
> +  v_y = v_1 + v_2;
> +  v_out_1 = __builtin_shuffle (v_y, v_x, sel);
> +
> +  /* Second vec perm sequence.  */
> +  v_1 = __builtin_shuffle (v_in, v_in, sel10);
> +  v_2 = __builtin_shuffle (v_in, v_in, sel11);
> +  v_x = v_2 - v_1;
> +  v_y = v_1 + v_2;
> +  v_out_2 = __builtin_shuffle (v_y, v_x, sel);
> +
> +  /* Won't blend because the narrowed sequence
> +     utilizes three of the four lanes.  */
> +
> +  *p_v_out_1 = v_out_1;
> +  *p_v_out_2 = v_out_2;
> +}
> +
> +/* { dg-final { scan-tree-dump "Vec perm simplify sequences have been 
> blended" "forwprop1" { target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */
> +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 7, 2, 6 }" "forwprop1" { 
> target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 7cae08f0d79..dae8c2f435b 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -3479,41 +3479,6 @@ fwprop_ssa_val (tree name)
>   return name;
> }
> 
> -/* Get an index map from the provided vector permute selector
> -   and return the number of unique indices.
> -   E.g.: { 1, 3, 1, 3 } -> <0, 1, 0, 1>, 2
> -     { 0, 2, 0, 2 } -> <0, 1, 0, 1>, 2
> -     { 3, 2, 1, 0 } -> <0, 1, 2, 3>, 4.  */
> -
> -static unsigned int
> -get_vect_selector_index_map (tree sel, vec<unsigned int> *index_map)
> -{
> -  gcc_assert (VECTOR_CST_NELTS (sel).is_constant ());
> -  unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
> -  unsigned int n = 0;
> -
> -  for (unsigned int i = 0; i < nelts; i++)
> -    {
> -      /* Extract the i-th value from the selector.  */
> -      tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
> -      unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
> -
> -      unsigned int j = 0;
> -      for (; j <= i; j++)
> -    {
> -      tree prev_sel_cst_tree = VECTOR_CST_ELT (sel, j);
> -      unsigned int prev_sel_cst
> -        = TREE_INT_CST_LOW (prev_sel_cst_tree);
> -      if (prev_sel_cst == sel_cst)
> -        break;
> -    }
> -      index_map->quick_push (j);
> -      n += (i == j) ? 1 : 0;
> -    }
> -
> -  return n;
> -}
> -
> /* Search for opportunities to free half of the lanes in the following 
> pattern:
> 
>      v_in = {e0, e1, e2, e3}
> @@ -3562,6 +3527,10 @@ recognise_vec_perm_simplify_seq (gassign *stmt, 
> vec_perm_simplify_seq *seq)
> 
>   unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
> 
> +  /* Don't analyse sequences with many lanes.  */
> +  if (nelts > 4)
> +    return false;
> +
>   /* Lookup the definition of v_x and v_y.  */
>   gassign *v_x_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x));
>   gassign *v_y_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_y));
> @@ -3632,42 +3601,47 @@ recognise_vec_perm_simplify_seq (gassign *stmt, 
> vec_perm_simplify_seq *seq)
>       || nelts != VECTOR_CST_NELTS (v_2_sel).to_constant ())
>     return false;
> 
> -  /* Now check permutation selection operands.  */
> -  auto_vec<unsigned int> v_1_lane_map, v_2_lane_map;
> -  v_1_lane_map.reserve (nelts);
> -  v_2_lane_map.reserve (nelts);
> -  unsigned int v_1_lanes, v_2_lanes;
> -  v_1_lanes = get_vect_selector_index_map (v_1_sel, &v_1_lane_map);
> -  v_2_lanes = get_vect_selector_index_map (v_2_sel, &v_2_lane_map);
> -
> -  /* Check if we could free up half of the lanes.  */
> -  if (v_1_lanes != v_2_lanes || v_1_lanes > (nelts / 2))
> -    return false;
> -
>   /* Create the new selector.  */
>   vec_perm_builder new_sel_perm (nelts, nelts, 1);
> +  auto_vec<unsigned int> lanes (nelts);
> +  lanes.quick_grow_cleared (nelts);
>   for (unsigned int i = 0; i < nelts; i++)
>     {
>       /* Extract the i-th value from the selector.  */
> -      tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
> -      unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
> -
> -      unsigned int j;
> -      if (sel_cst < nelts)
> -    j = v_1_lane_map[sel_cst];
> -      else
> -    j = v_2_lane_map[sel_cst - nelts] + nelts;
> +      unsigned int sel_cst = TREE_INT_CST_LOW (VECTOR_CST_ELT (sel, i));
> +      unsigned int lane = sel_cst % nelts;
> +      unsigned int offs = sel_cst / nelts;
> 
> -      new_sel_perm.quick_push (j);
> +      /* Check what's in the lane.  */
> +      unsigned int e_1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, lane));
> +      unsigned int e_2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, lane));
> 
> -      if (dump_file && (dump_flags & TDF_DETAILS))
> +      /* Reuse previous lane (if any).  */
> +      unsigned int l = 0;
> +      for (; l < lane; l++)
>    {
> -      fprintf (dump_file, "%u", j);
> -      if (i != (nelts -1))
> -        fprintf (dump_file, ", ");
> +      if ((TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, l)) == e_1)
> +          && (TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, l)) == e_2))
> +        break;
>    }
> +
> +      /* Add to narrowed selector.  */
> +      new_sel_perm.quick_push (l + offs * nelts);
> +
> +      /* Mark lane as used.  */
> +      lanes[l] = 1;
>     }
> 
> +  /* Count how many lanes are need.  */
> +  unsigned int cnt = 0;
> +  for (unsigned int i = 0; i < nelts; i++)
> +    cnt += lanes[i];
> +
> +  /* If more than (nelts/2) lanes are needed, skip the sequence.  */
> +  if (cnt > nelts / 2)
> +    return false;
> +
> +  /* Check if the resulting permuation is cheap.  */
>   vec_perm_indices new_indices (new_sel_perm, 2, nelts);
>   tree vectype = TREE_TYPE (gimple_assign_lhs (stmt));
>   machine_mode vmode = TYPE_MODE (vectype);
> @@ -3685,8 +3659,15 @@ recognise_vec_perm_simplify_seq (gassign *stmt, 
> vec_perm_simplify_seq *seq)
> 
>   if (dump_file)
>     {
> -      fprintf (dump_file, "Found vec perm simplify sequence ending with: ");
> +      fprintf (dump_file, "Found vec perm simplify sequence ending 
> with:\n\t");
>       print_gimple_stmt (dump_file, stmt, 0);
> +
> +      if (dump_flags & TDF_DETAILS)
> +    {
> +      fprintf (dump_file, "\tNarrowed vec_perm selector: ");
> +      print_generic_expr (dump_file, (*seq)->new_sel);
> +      fprintf (dump_file, "\n");
> +    }
>     }
> 
>   return true;
> @@ -3805,29 +3786,66 @@ calc_perm_vec_perm_simplify_seqs 
> (vec_perm_simplify_seq seq1,
>   unsigned int i;
>   unsigned int nelts = seq1->nelts;
>   auto_vec<int> lane_assignment;
> -  lane_assignment.create (2 * nelts);
> +  lane_assignment.create (nelts);
> 
>   /* Mark all lanes as free.  */
> -  lane_assignment.quick_grow_cleared (2 * nelts);
> +  lane_assignment.quick_grow_cleared (nelts);
> 
> -  /* Reserve lanes for seq1.  */
> +  /* Allocate lanes for seq1.  */
>   for (i = 0; i < nelts; i++)
>     {
>       unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq1->new_sel, i));
> +      l %= nelts;
>       lane_assignment[l] = 1;
> -    }
> +}
> 
> -  /* Reserve lanes for seq2 and calculate selector for seq2->stmt.  */
> +  /* Allocate lanes for seq2 and calculate selector for seq2->stmt.  */
>   vec_perm_builder seq2_stmt_sel_perm (nelts, nelts, 1);
>   for (i = 0; i < nelts; i++)
>     {
> -      unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i));
> -      while (lane_assignment[l] != 0)
> -    l++;
> -      lane_assignment[l] = 2;
> -      seq2_stmt_sel_perm.quick_push (l);
> +      unsigned int sel = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, 
> i));
> +      unsigned int lane = sel % nelts;
> +      unsigned int offs = sel / nelts;
> +      unsigned int new_sel;
> +
> +      /* Check if we already allocated the lane for seq2.  */
> +      unsigned int j = 0;
> +      for (; j < i; j++)
> +    {
> +      unsigned int sel_old;
> +      sel_old = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, j));
> +      unsigned int lane_old = sel_old % nelts;
> +      if (lane == lane_old)
> +        {
> +          new_sel = seq2_stmt_sel_perm[j].to_constant ();
> +          new_sel = (new_sel % nelts) + offs * nelts;
> +          break;
> +        }
> +    }
> +
> +      /* If the lane is not allocated, we need to do that now.  */
> +      if (j == i)
> +    {
> +      unsigned int l_orig = lane;
> +      while (lane_assignment[lane] != 0)
> +        {
> +          lane = (lane + 1) % nelts;
> +
> +          /* This should not happen if both sequences utilize no more than
> +         half of the lanes.  Test anyway to guarantee termination.  */
> +          if (lane == l_orig)
> +        return false;
> +        }
> +
> +      /* Allocate lane.  */
> +      lane_assignment[lane] = 2;
> +      new_sel = lane + offs * nelts;
> +    }
> +
> +      seq2_stmt_sel_perm.quick_push (new_sel);
>     }
> 
> +  /* Check if the resulting permuation is cheap.  */
>   seq2_stmt_indices->new_vector (seq2_stmt_sel_perm, 2, nelts);
>   tree vectype = TREE_TYPE (gimple_assign_lhs (seq2->stmt));
>   machine_mode vmode = TYPE_MODE (vectype);
> @@ -3839,13 +3857,40 @@ calc_perm_vec_perm_simplify_seqs 
> (vec_perm_simplify_seq seq1,
>   vec_perm_builder seq1_v_2_stmt_sel_perm (nelts, nelts, 1);
>   for (i = 0; i < nelts; i++)
>     {
> -      bool use_seq1 = lane_assignment[i] == 1;
> -      tree s1 = gimple_assign_rhs3 (use_seq1 ? seq1->v_1_stmt
> -                         : seq2->v_1_stmt);
> -      tree s2 = gimple_assign_rhs3 (use_seq1 ? seq1->v_2_stmt
> -                         : seq2->v_2_stmt);
> -      unsigned int l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i)) % nelts;
> -      unsigned int l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i)) % nelts;
> +      bool use_seq1 = lane_assignment[i] != 2;
> +      unsigned int l1, l2;
> +
> +      if (use_seq1)
> +    {
> +      /* Just reuse the selector indices.  */
> +      tree s1 = gimple_assign_rhs3 (seq1->v_1_stmt);
> +      tree s2 = gimple_assign_rhs3 (seq1->v_2_stmt);
> +      l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i));
> +      l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i));
> +    }
> +      else
> +    {
> +      /* We moved the lanes for seq2, so we need to adjust for that.  */
> +      tree s1 = gimple_assign_rhs3 (seq2->v_1_stmt);
> +      tree s2 = gimple_assign_rhs3 (seq2->v_2_stmt);
> +
> +      unsigned int j = 0;
> +      for (; j < i; j++)
> +        {
> +          unsigned int sel_new;
> +          sel_new = seq2_stmt_sel_perm[j].to_constant ();
> +          sel_new %= nelts;
> +          if (sel_new == i)
> +        break;
> +        }
> +
> +      /* This should not happen.  Test anyway to guarantee correctness.  */
> +      if (j == i)
> +        return false;
> +
> +      l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, j));
> +      l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, j));
> +    }
> 
>       seq1_v_1_stmt_sel_perm.quick_push (l1 + (use_seq1 ? 0 : nelts));
>       seq1_v_2_stmt_sel_perm.quick_push (l2 + (use_seq1 ? 0 : nelts));
> @@ -3960,7 +4005,7 @@ process_vec_perm_simplify_seq_list 
> (vec<vec_perm_simplify_seq> *l)
>     return;
> 
>   if (dump_file && (dump_flags & TDF_DETAILS))
> -    fprintf (dump_file, "Processing %u vec perm simplify sequences.\n",
> +    fprintf (dump_file, "\nProcessing %u vec perm simplify sequences.\n",
>         l->length ());
> 
>   FOR_EACH_VEC_ELT (*l, i, seq1)
> --
> 2.47.1
> 

Reply via email to