https://gcc.gnu.org/g:eee2891312a9b42acabcc82739604c9fa8421757

commit r15-6387-geee2891312a9b42acabcc82739604c9fa8421757
Author: Christoph Müllner <christoph.muell...@vrull.eu>
Date:   Thu Dec 5 20:39:25 2024 +0100

    forwprop: Fix lane handling for VEC_PERM sequence blending
    
    In PR117830 a miscompilation of 464.h264ref was reported.
    An analysis showed that wrong code was generated because of
    unsatisfied assumptions.  This patch addresses these issues.
    
    The first assumption was that we could independently analyze the two
    vec-perms at the start of a vec-perm-simplify sequence and use the
    information  later for calculating a final vec-perm selector that
    utilizes fewer lanes.  However, this information does not help much,
    because for changing the selector entry, we need to ensure that both
    elements of the operand vectors v_1 and v_2 remain equal.
    This is addressed by removing the function get_vect_selector_index_map
    and checking for this equality in the loop where we create the new
    selector.
    
    The calculation of the selector vector for the blended sequence
    assumed that the indices of the selector vector of the narrowed
    sequences are increasing.  This assumption does not hold in general.
    This was fixed by allowing a wrap-around when searching for an empty
    lane.
    
    Further, there was an issue in the calculation of the selector vector
    entries for the second sequence.  The code did not consider that the
    lanes of the second sequence could have been moved.
    
    A relevant property of this patch is that it introduces a
    couple of nested loops, where the out loop iterates from
    i=0..nelts and the inner loop iterates from j=0..i.
    To avoid performance concerns, a check is introduced that
    ensures nelts won't exceed 4 lanes.
    
    The added test case is derived from h264ref (the other cases from the
    benchmark have the same structure and don't provide additional coverage).
    
    Bootstrapped and regression-tested on x86-64 and aarch64.
    Further, tested on CPU 2006 h264ref and CPU 2017 x264.
    
            PR117830
    
    gcc/ChangeLog:
    
            * tree-ssa-forwprop.cc (get_vect_selector_index_map): Removed.
            (recognise_vec_perm_simplify_seq): Fix calculation of vec-perm
            selectors of narrowed sequence.
            (calc_perm_vec_perm_simplify_seqs): Fixing calculation of
            vec-perm selectors of the blended sequence.
            (process_vec_perm_simplify_seq_list): Add whitespace to dump
            string to avoid bad formatted dump output.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/tree-ssa/vector-11.c: New test.
    
    Signed-off-by: Christoph Müllner <christoph.muell...@vrull.eu>

Diff:
---
 gcc/testsuite/gcc.dg/tree-ssa/vector-11.c |  38 ++++++
 gcc/tree-ssa-forwprop.cc                  | 203 ++++++++++++++++++------------
 2 files changed, 162 insertions(+), 79 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c 
b/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c
new file mode 100644
index 000000000000..e4102d318d29
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details -Wno-psabi" } */
+
+typedef int vec __attribute__((vector_size (4 * sizeof (int))));
+
+void f1 (vec *p_v_in, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel00 = { 2, 3, 2, 2 };
+  vec sel01 = { 1, 0, 1, 1 };
+  vec sel10 = { 3, 2, 3, 3 };
+  vec sel11 = { 0, 1, 0, 0 };
+  vec sel = { 0, 5, 2, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in = *p_v_in;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in, v_in, sel00);
+  v_2 = __builtin_shuffle (v_in, v_in, sel01);
+  v_x = v_2 - v_1;
+  v_y = v_1 + v_2;
+  v_out_1 = __builtin_shuffle (v_y, v_x, sel);
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in, v_in, sel10);
+  v_2 = __builtin_shuffle (v_in, v_in, sel11);
+  v_x = v_2 - v_1;
+  v_y = v_1 + v_2;
+  v_out_2 = __builtin_shuffle (v_y, v_x, sel);
+
+  /* Won't blend because the narrowed sequence
+     utilizes three of the four lanes.  */
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+/* { dg-final { scan-tree-dump "Vec perm simplify sequences have been blended" 
"forwprop1" { target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 7, 2, 6 }" "forwprop1" { 
target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 7cae08f0d798..dae8c2f435bc 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -3479,41 +3479,6 @@ fwprop_ssa_val (tree name)
   return name;
 }
 
-/* Get an index map from the provided vector permute selector
-   and return the number of unique indices.
-   E.g.: { 1, 3, 1, 3 } -> <0, 1, 0, 1>, 2
-        { 0, 2, 0, 2 } -> <0, 1, 0, 1>, 2
-        { 3, 2, 1, 0 } -> <0, 1, 2, 3>, 4.  */
-
-static unsigned int
-get_vect_selector_index_map (tree sel, vec<unsigned int> *index_map)
-{
-  gcc_assert (VECTOR_CST_NELTS (sel).is_constant ());
-  unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
-  unsigned int n = 0;
-
-  for (unsigned int i = 0; i < nelts; i++)
-    {
-      /* Extract the i-th value from the selector.  */
-      tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
-      unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
-
-      unsigned int j = 0;
-      for (; j <= i; j++)
-       {
-         tree prev_sel_cst_tree = VECTOR_CST_ELT (sel, j);
-         unsigned int prev_sel_cst
-           = TREE_INT_CST_LOW (prev_sel_cst_tree);
-         if (prev_sel_cst == sel_cst)
-           break;
-       }
-      index_map->quick_push (j);
-      n += (i == j) ? 1 : 0;
-    }
-
-  return n;
-}
-
 /* Search for opportunities to free half of the lanes in the following pattern:
 
      v_in = {e0, e1, e2, e3}
@@ -3562,6 +3527,10 @@ recognise_vec_perm_simplify_seq (gassign *stmt, 
vec_perm_simplify_seq *seq)
 
   unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
 
+  /* Don't analyse sequences with many lanes.  */
+  if (nelts > 4)
+    return false;
+
   /* Lookup the definition of v_x and v_y.  */
   gassign *v_x_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x));
   gassign *v_y_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_y));
@@ -3632,42 +3601,47 @@ recognise_vec_perm_simplify_seq (gassign *stmt, 
vec_perm_simplify_seq *seq)
       || nelts != VECTOR_CST_NELTS (v_2_sel).to_constant ())
     return false;
 
-  /* Now check permutation selection operands.  */
-  auto_vec<unsigned int> v_1_lane_map, v_2_lane_map;
-  v_1_lane_map.reserve (nelts);
-  v_2_lane_map.reserve (nelts);
-  unsigned int v_1_lanes, v_2_lanes;
-  v_1_lanes = get_vect_selector_index_map (v_1_sel, &v_1_lane_map);
-  v_2_lanes = get_vect_selector_index_map (v_2_sel, &v_2_lane_map);
-
-  /* Check if we could free up half of the lanes.  */
-  if (v_1_lanes != v_2_lanes || v_1_lanes > (nelts / 2))
-    return false;
-
   /* Create the new selector.  */
   vec_perm_builder new_sel_perm (nelts, nelts, 1);
+  auto_vec<unsigned int> lanes (nelts);
+  lanes.quick_grow_cleared (nelts);
   for (unsigned int i = 0; i < nelts; i++)
     {
       /* Extract the i-th value from the selector.  */
-      tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
-      unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
-
-      unsigned int j;
-      if (sel_cst < nelts)
-       j = v_1_lane_map[sel_cst];
-      else
-       j = v_2_lane_map[sel_cst - nelts] + nelts;
+      unsigned int sel_cst = TREE_INT_CST_LOW (VECTOR_CST_ELT (sel, i));
+      unsigned int lane = sel_cst % nelts;
+      unsigned int offs = sel_cst / nelts;
 
-      new_sel_perm.quick_push (j);
+      /* Check what's in the lane.  */
+      unsigned int e_1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, lane));
+      unsigned int e_2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, lane));
 
-      if (dump_file && (dump_flags & TDF_DETAILS))
+      /* Reuse previous lane (if any).  */
+      unsigned int l = 0;
+      for (; l < lane; l++)
        {
-         fprintf (dump_file, "%u", j);
-         if (i != (nelts -1))
-           fprintf (dump_file, ", ");
+         if ((TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, l)) == e_1)
+             && (TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, l)) == e_2))
+           break;
        }
+
+      /* Add to narrowed selector.  */
+      new_sel_perm.quick_push (l + offs * nelts);
+
+      /* Mark lane as used.  */
+      lanes[l] = 1;
     }
 
+  /* Count how many lanes are need.  */
+  unsigned int cnt = 0;
+  for (unsigned int i = 0; i < nelts; i++)
+    cnt += lanes[i];
+
+  /* If more than (nelts/2) lanes are needed, skip the sequence.  */
+  if (cnt > nelts / 2)
+    return false;
+
+  /* Check if the resulting permuation is cheap.  */
   vec_perm_indices new_indices (new_sel_perm, 2, nelts);
   tree vectype = TREE_TYPE (gimple_assign_lhs (stmt));
   machine_mode vmode = TYPE_MODE (vectype);
@@ -3685,8 +3659,15 @@ recognise_vec_perm_simplify_seq (gassign *stmt, 
vec_perm_simplify_seq *seq)
 
   if (dump_file)
     {
-      fprintf (dump_file, "Found vec perm simplify sequence ending with: ");
+      fprintf (dump_file, "Found vec perm simplify sequence ending with:\n\t");
       print_gimple_stmt (dump_file, stmt, 0);
+
+      if (dump_flags & TDF_DETAILS)
+       {
+         fprintf (dump_file, "\tNarrowed vec_perm selector: ");
+         print_generic_expr (dump_file, (*seq)->new_sel);
+         fprintf (dump_file, "\n");
+       }
     }
 
   return true;
@@ -3805,29 +3786,66 @@ calc_perm_vec_perm_simplify_seqs (vec_perm_simplify_seq 
seq1,
   unsigned int i;
   unsigned int nelts = seq1->nelts;
   auto_vec<int> lane_assignment;
-  lane_assignment.create (2 * nelts);
+  lane_assignment.create (nelts);
 
   /* Mark all lanes as free.  */
-  lane_assignment.quick_grow_cleared (2 * nelts);
+  lane_assignment.quick_grow_cleared (nelts);
 
-  /* Reserve lanes for seq1.  */
+  /* Allocate lanes for seq1.  */
   for (i = 0; i < nelts; i++)
     {
       unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq1->new_sel, i));
+      l %= nelts;
       lane_assignment[l] = 1;
-    }
+}
 
-  /* Reserve lanes for seq2 and calculate selector for seq2->stmt.  */
+  /* Allocate lanes for seq2 and calculate selector for seq2->stmt.  */
   vec_perm_builder seq2_stmt_sel_perm (nelts, nelts, 1);
   for (i = 0; i < nelts; i++)
     {
-      unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i));
-      while (lane_assignment[l] != 0)
-       l++;
-      lane_assignment[l] = 2;
-      seq2_stmt_sel_perm.quick_push (l);
+      unsigned int sel = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i));
+      unsigned int lane = sel % nelts;
+      unsigned int offs = sel / nelts;
+      unsigned int new_sel;
+
+      /* Check if we already allocated the lane for seq2.  */
+      unsigned int j = 0;
+      for (; j < i; j++)
+       {
+         unsigned int sel_old;
+         sel_old = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, j));
+         unsigned int lane_old = sel_old % nelts;
+         if (lane == lane_old)
+           {
+             new_sel = seq2_stmt_sel_perm[j].to_constant ();
+             new_sel = (new_sel % nelts) + offs * nelts;
+             break;
+           }
+       }
+
+      /* If the lane is not allocated, we need to do that now.  */
+      if (j == i)
+       {
+         unsigned int l_orig = lane;
+         while (lane_assignment[lane] != 0)
+           {
+             lane = (lane + 1) % nelts;
+
+             /* This should not happen if both sequences utilize no more than
+                half of the lanes.  Test anyway to guarantee termination.  */
+             if (lane == l_orig)
+               return false;
+           }
+
+         /* Allocate lane.  */
+         lane_assignment[lane] = 2;
+         new_sel = lane + offs * nelts;
+       }
+
+      seq2_stmt_sel_perm.quick_push (new_sel);
     }
 
+  /* Check if the resulting permuation is cheap.  */
   seq2_stmt_indices->new_vector (seq2_stmt_sel_perm, 2, nelts);
   tree vectype = TREE_TYPE (gimple_assign_lhs (seq2->stmt));
   machine_mode vmode = TYPE_MODE (vectype);
@@ -3839,13 +3857,40 @@ calc_perm_vec_perm_simplify_seqs (vec_perm_simplify_seq 
seq1,
   vec_perm_builder seq1_v_2_stmt_sel_perm (nelts, nelts, 1);
   for (i = 0; i < nelts; i++)
     {
-      bool use_seq1 = lane_assignment[i] == 1;
-      tree s1 = gimple_assign_rhs3 (use_seq1 ? seq1->v_1_stmt
-                                            : seq2->v_1_stmt);
-      tree s2 = gimple_assign_rhs3 (use_seq1 ? seq1->v_2_stmt
-                                            : seq2->v_2_stmt);
-      unsigned int l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i)) % nelts;
-      unsigned int l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i)) % nelts;
+      bool use_seq1 = lane_assignment[i] != 2;
+      unsigned int l1, l2;
+
+      if (use_seq1)
+       {
+         /* Just reuse the selector indices.  */
+         tree s1 = gimple_assign_rhs3 (seq1->v_1_stmt);
+         tree s2 = gimple_assign_rhs3 (seq1->v_2_stmt);
+         l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i));
+         l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i));
+       }
+      else
+       {
+         /* We moved the lanes for seq2, so we need to adjust for that.  */
+         tree s1 = gimple_assign_rhs3 (seq2->v_1_stmt);
+         tree s2 = gimple_assign_rhs3 (seq2->v_2_stmt);
+
+         unsigned int j = 0;
+         for (; j < i; j++)
+           {
+             unsigned int sel_new;
+             sel_new = seq2_stmt_sel_perm[j].to_constant ();
+             sel_new %= nelts;
+             if (sel_new == i)
+               break;
+           }
+
+         /* This should not happen.  Test anyway to guarantee correctness.  */
+         if (j == i)
+           return false;
+
+         l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, j));
+         l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, j));
+       }
 
       seq1_v_1_stmt_sel_perm.quick_push (l1 + (use_seq1 ? 0 : nelts));
       seq1_v_2_stmt_sel_perm.quick_push (l2 + (use_seq1 ? 0 : nelts));
@@ -3960,7 +4005,7 @@ process_vec_perm_simplify_seq_list 
(vec<vec_perm_simplify_seq> *l)
     return;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "Processing %u vec perm simplify sequences.\n",
+    fprintf (dump_file, "\nProcessing %u vec perm simplify sequences.\n",
             l->length ());
 
   FOR_EACH_VEC_ELT (*l, i, seq1)

Reply via email to