> Am 19.12.2024 um 21:47 schrieb Christoph Müllner
> <christoph.muell...@vrull.eu>:
>
> In PR117830 a miscompilation of 464.h264ref was reported.
> An analysis showed that wrong code was generated because of
> unsatisfied assuptions in the code. This patch addresses
> these issues.
>
> The first assuption was that we can independently analyse the two
> vec-perms at the start of a vec-perm-simplify sequence and use the
> information later for calculating a final vec-perm selector that
> utilizes less lanes. However, this information does not help much,
> because for changing the selector entry, we need to ensure that both
> elements of the operand vectors v_1 and v_2 remain equal.
> This is addressed by removing the function get_vect_selector_index_map
> and checking for this equality in the loop where we create the new
> selector.
>
> The calculation of the selector vector for the blended sequence
> assumed that the indices of the selector vector of the narrowed
> sequences are increasing. This assuption does not hold in general.
> This was fixed by allowing a wrap-around when searching for an empty
> lane.
>
> Further, there was an issue in the calculation of the selector vector
> entries for the second sequence. The code did not consider that the
> lanes of the second sequence could have been moved.
>
> A relevant property of this patch is, that it introduces a
> couple of nested loops, where the out loop iterates from
> i=0..nelts and the inner loop iterates from j=0..i.
> To avoid performance concerns, a check is introduced that
> ensures nelts won't exceed 4 lanes.
>
> The added test case is derived from h264ref (the other cases from the
> benchmark have the same structure and don't provide additional coverage).
>
> Bootstrapped and regression-tested on x86-64 and aarch64.
> Further, tested on CPU 2006 h264ref and CPU 2017 x264.
Ok
Thanks,
Richard
> PR117830
>
> gcc/ChangeLog:
>
> * tree-ssa-forwprop.cc (get_vect_selector_index_map):
> (recognise_vec_perm_simplify_seq):
> (calc_perm_vec_perm_simplify_seqs):
> (process_vec_perm_simplify_seq_list):
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/vector-11.c: New test.
>
> Signed-off-by: Christoph Müllner <christoph.muell...@vrull.eu>
> ---
> gcc/testsuite/gcc.dg/tree-ssa/vector-11.c | 38 ++++
> gcc/tree-ssa-forwprop.cc | 203 +++++++++++++---------
> 2 files changed, 162 insertions(+), 79 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/vector-11.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c
> b/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c
> new file mode 100644
> index 00000000000..e4102d318d2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details -Wno-psabi" }
> */
> +
> +typedef int vec __attribute__((vector_size (4 * sizeof (int))));
> +
> +void f1 (vec *p_v_in, vec *p_v_out_1, vec *p_v_out_2)
> +{
> + vec sel00 = { 2, 3, 2, 2 };
> + vec sel01 = { 1, 0, 1, 1 };
> + vec sel10 = { 3, 2, 3, 3 };
> + vec sel11 = { 0, 1, 0, 0 };
> + vec sel = { 0, 5, 2, 7 };
> + vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
> + vec v_in = *p_v_in;
> +
> + /* First vec perm sequence. */
> + v_1 = __builtin_shuffle (v_in, v_in, sel00);
> + v_2 = __builtin_shuffle (v_in, v_in, sel01);
> + v_x = v_2 - v_1;
> + v_y = v_1 + v_2;
> + v_out_1 = __builtin_shuffle (v_y, v_x, sel);
> +
> + /* Second vec perm sequence. */
> + v_1 = __builtin_shuffle (v_in, v_in, sel10);
> + v_2 = __builtin_shuffle (v_in, v_in, sel11);
> + v_x = v_2 - v_1;
> + v_y = v_1 + v_2;
> + v_out_2 = __builtin_shuffle (v_y, v_x, sel);
> +
> + /* Won't blend because the narrowed sequence
> + utilizes three of the four lanes. */
> +
> + *p_v_out_1 = v_out_1;
> + *p_v_out_2 = v_out_2;
> +}
> +
> +/* { dg-final { scan-tree-dump "Vec perm simplify sequences have been
> blended" "forwprop1" { target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */
> +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 7, 2, 6 }" "forwprop1" {
> target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 7cae08f0d79..dae8c2f435b 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -3479,41 +3479,6 @@ fwprop_ssa_val (tree name)
> return name;
> }
>
> -/* Get an index map from the provided vector permute selector
> - and return the number of unique indices.
> - E.g.: { 1, 3, 1, 3 } -> <0, 1, 0, 1>, 2
> - { 0, 2, 0, 2 } -> <0, 1, 0, 1>, 2
> - { 3, 2, 1, 0 } -> <0, 1, 2, 3>, 4. */
> -
> -static unsigned int
> -get_vect_selector_index_map (tree sel, vec<unsigned int> *index_map)
> -{
> - gcc_assert (VECTOR_CST_NELTS (sel).is_constant ());
> - unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
> - unsigned int n = 0;
> -
> - for (unsigned int i = 0; i < nelts; i++)
> - {
> - /* Extract the i-th value from the selector. */
> - tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
> - unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
> -
> - unsigned int j = 0;
> - for (; j <= i; j++)
> - {
> - tree prev_sel_cst_tree = VECTOR_CST_ELT (sel, j);
> - unsigned int prev_sel_cst
> - = TREE_INT_CST_LOW (prev_sel_cst_tree);
> - if (prev_sel_cst == sel_cst)
> - break;
> - }
> - index_map->quick_push (j);
> - n += (i == j) ? 1 : 0;
> - }
> -
> - return n;
> -}
> -
> /* Search for opportunities to free half of the lanes in the following
> pattern:
>
> v_in = {e0, e1, e2, e3}
> @@ -3562,6 +3527,10 @@ recognise_vec_perm_simplify_seq (gassign *stmt,
> vec_perm_simplify_seq *seq)
>
> unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
>
> + /* Don't analyse sequences with many lanes. */
> + if (nelts > 4)
> + return false;
> +
> /* Lookup the definition of v_x and v_y. */
> gassign *v_x_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x));
> gassign *v_y_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_y));
> @@ -3632,42 +3601,47 @@ recognise_vec_perm_simplify_seq (gassign *stmt,
> vec_perm_simplify_seq *seq)
> || nelts != VECTOR_CST_NELTS (v_2_sel).to_constant ())
> return false;
>
> - /* Now check permutation selection operands. */
> - auto_vec<unsigned int> v_1_lane_map, v_2_lane_map;
> - v_1_lane_map.reserve (nelts);
> - v_2_lane_map.reserve (nelts);
> - unsigned int v_1_lanes, v_2_lanes;
> - v_1_lanes = get_vect_selector_index_map (v_1_sel, &v_1_lane_map);
> - v_2_lanes = get_vect_selector_index_map (v_2_sel, &v_2_lane_map);
> -
> - /* Check if we could free up half of the lanes. */
> - if (v_1_lanes != v_2_lanes || v_1_lanes > (nelts / 2))
> - return false;
> -
> /* Create the new selector. */
> vec_perm_builder new_sel_perm (nelts, nelts, 1);
> + auto_vec<unsigned int> lanes (nelts);
> + lanes.quick_grow_cleared (nelts);
> for (unsigned int i = 0; i < nelts; i++)
> {
> /* Extract the i-th value from the selector. */
> - tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
> - unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
> -
> - unsigned int j;
> - if (sel_cst < nelts)
> - j = v_1_lane_map[sel_cst];
> - else
> - j = v_2_lane_map[sel_cst - nelts] + nelts;
> + unsigned int sel_cst = TREE_INT_CST_LOW (VECTOR_CST_ELT (sel, i));
> + unsigned int lane = sel_cst % nelts;
> + unsigned int offs = sel_cst / nelts;
>
> - new_sel_perm.quick_push (j);
> + /* Check what's in the lane. */
> + unsigned int e_1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, lane));
> + unsigned int e_2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, lane));
>
> - if (dump_file && (dump_flags & TDF_DETAILS))
> + /* Reuse previous lane (if any). */
> + unsigned int l = 0;
> + for (; l < lane; l++)
> {
> - fprintf (dump_file, "%u", j);
> - if (i != (nelts -1))
> - fprintf (dump_file, ", ");
> + if ((TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, l)) == e_1)
> + && (TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, l)) == e_2))
> + break;
> }
> +
> + /* Add to narrowed selector. */
> + new_sel_perm.quick_push (l + offs * nelts);
> +
> + /* Mark lane as used. */
> + lanes[l] = 1;
> }
>
> + /* Count how many lanes are need. */
> + unsigned int cnt = 0;
> + for (unsigned int i = 0; i < nelts; i++)
> + cnt += lanes[i];
> +
> + /* If more than (nelts/2) lanes are needed, skip the sequence. */
> + if (cnt > nelts / 2)
> + return false;
> +
> + /* Check if the resulting permuation is cheap. */
> vec_perm_indices new_indices (new_sel_perm, 2, nelts);
> tree vectype = TREE_TYPE (gimple_assign_lhs (stmt));
> machine_mode vmode = TYPE_MODE (vectype);
> @@ -3685,8 +3659,15 @@ recognise_vec_perm_simplify_seq (gassign *stmt,
> vec_perm_simplify_seq *seq)
>
> if (dump_file)
> {
> - fprintf (dump_file, "Found vec perm simplify sequence ending with: ");
> + fprintf (dump_file, "Found vec perm simplify sequence ending
> with:\n\t");
> print_gimple_stmt (dump_file, stmt, 0);
> +
> + if (dump_flags & TDF_DETAILS)
> + {
> + fprintf (dump_file, "\tNarrowed vec_perm selector: ");
> + print_generic_expr (dump_file, (*seq)->new_sel);
> + fprintf (dump_file, "\n");
> + }
> }
>
> return true;
> @@ -3805,29 +3786,66 @@ calc_perm_vec_perm_simplify_seqs
> (vec_perm_simplify_seq seq1,
> unsigned int i;
> unsigned int nelts = seq1->nelts;
> auto_vec<int> lane_assignment;
> - lane_assignment.create (2 * nelts);
> + lane_assignment.create (nelts);
>
> /* Mark all lanes as free. */
> - lane_assignment.quick_grow_cleared (2 * nelts);
> + lane_assignment.quick_grow_cleared (nelts);
>
> - /* Reserve lanes for seq1. */
> + /* Allocate lanes for seq1. */
> for (i = 0; i < nelts; i++)
> {
> unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq1->new_sel, i));
> + l %= nelts;
> lane_assignment[l] = 1;
> - }
> +}
>
> - /* Reserve lanes for seq2 and calculate selector for seq2->stmt. */
> + /* Allocate lanes for seq2 and calculate selector for seq2->stmt. */
> vec_perm_builder seq2_stmt_sel_perm (nelts, nelts, 1);
> for (i = 0; i < nelts; i++)
> {
> - unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i));
> - while (lane_assignment[l] != 0)
> - l++;
> - lane_assignment[l] = 2;
> - seq2_stmt_sel_perm.quick_push (l);
> + unsigned int sel = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel,
> i));
> + unsigned int lane = sel % nelts;
> + unsigned int offs = sel / nelts;
> + unsigned int new_sel;
> +
> + /* Check if we already allocated the lane for seq2. */
> + unsigned int j = 0;
> + for (; j < i; j++)
> + {
> + unsigned int sel_old;
> + sel_old = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, j));
> + unsigned int lane_old = sel_old % nelts;
> + if (lane == lane_old)
> + {
> + new_sel = seq2_stmt_sel_perm[j].to_constant ();
> + new_sel = (new_sel % nelts) + offs * nelts;
> + break;
> + }
> + }
> +
> + /* If the lane is not allocated, we need to do that now. */
> + if (j == i)
> + {
> + unsigned int l_orig = lane;
> + while (lane_assignment[lane] != 0)
> + {
> + lane = (lane + 1) % nelts;
> +
> + /* This should not happen if both sequences utilize no more than
> + half of the lanes. Test anyway to guarantee termination. */
> + if (lane == l_orig)
> + return false;
> + }
> +
> + /* Allocate lane. */
> + lane_assignment[lane] = 2;
> + new_sel = lane + offs * nelts;
> + }
> +
> + seq2_stmt_sel_perm.quick_push (new_sel);
> }
>
> + /* Check if the resulting permuation is cheap. */
> seq2_stmt_indices->new_vector (seq2_stmt_sel_perm, 2, nelts);
> tree vectype = TREE_TYPE (gimple_assign_lhs (seq2->stmt));
> machine_mode vmode = TYPE_MODE (vectype);
> @@ -3839,13 +3857,40 @@ calc_perm_vec_perm_simplify_seqs
> (vec_perm_simplify_seq seq1,
> vec_perm_builder seq1_v_2_stmt_sel_perm (nelts, nelts, 1);
> for (i = 0; i < nelts; i++)
> {
> - bool use_seq1 = lane_assignment[i] == 1;
> - tree s1 = gimple_assign_rhs3 (use_seq1 ? seq1->v_1_stmt
> - : seq2->v_1_stmt);
> - tree s2 = gimple_assign_rhs3 (use_seq1 ? seq1->v_2_stmt
> - : seq2->v_2_stmt);
> - unsigned int l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i)) % nelts;
> - unsigned int l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i)) % nelts;
> + bool use_seq1 = lane_assignment[i] != 2;
> + unsigned int l1, l2;
> +
> + if (use_seq1)
> + {
> + /* Just reuse the selector indices. */
> + tree s1 = gimple_assign_rhs3 (seq1->v_1_stmt);
> + tree s2 = gimple_assign_rhs3 (seq1->v_2_stmt);
> + l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i));
> + l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i));
> + }
> + else
> + {
> + /* We moved the lanes for seq2, so we need to adjust for that. */
> + tree s1 = gimple_assign_rhs3 (seq2->v_1_stmt);
> + tree s2 = gimple_assign_rhs3 (seq2->v_2_stmt);
> +
> + unsigned int j = 0;
> + for (; j < i; j++)
> + {
> + unsigned int sel_new;
> + sel_new = seq2_stmt_sel_perm[j].to_constant ();
> + sel_new %= nelts;
> + if (sel_new == i)
> + break;
> + }
> +
> + /* This should not happen. Test anyway to guarantee correctness. */
> + if (j == i)
> + return false;
> +
> + l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, j));
> + l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, j));
> + }
>
> seq1_v_1_stmt_sel_perm.quick_push (l1 + (use_seq1 ? 0 : nelts));
> seq1_v_2_stmt_sel_perm.quick_push (l2 + (use_seq1 ? 0 : nelts));
> @@ -3960,7 +4005,7 @@ process_vec_perm_simplify_seq_list
> (vec<vec_perm_simplify_seq> *l)
> return;
>
> if (dump_file && (dump_flags & TDF_DETAILS))
> - fprintf (dump_file, "Processing %u vec perm simplify sequences.\n",
> + fprintf (dump_file, "\nProcessing %u vec perm simplify sequences.\n",
> l->length ());
>
> FOR_EACH_VEC_ELT (*l, i, seq1)
> --
> 2.47.1
>