> On 16.04.2025 at 23:37, Tamar Christina <tamar.christ...@arm.com> wrote:
>
> Hi All,
>
> The following testcase shows an incorrect masked codegen:
>
> #define N 512
> #define START 1
> #define END 505
>
> int x[N] __attribute__((aligned(32)));
>
> int __attribute__((noipa))
> foo (void)
> {
>   int z = 0;
>   for (unsigned int i = START; i < END; ++i)
>     {
>       z++;
>       if (x[i] > 0)
>         continue;
>
>       return z;
>     }
>   return -1;
> }
>
> Notice that there's a continue there instead of a break.  This means we
> generate control flow where the success case stays within the loop iteration:
>
> mask_patt_9.12_46 = vect__1.11_45 > { 0, 0, 0, 0 };
> vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
> if (vec_mask_and_47 == { -1, -1, -1, -1 })
> goto <bb 4>; [41.48%]
> else
> goto <bb 15>; [58.52%]
>
> However, when loop_mask_41 is a partial mask this comparison can lead to an
> incorrect match.  In this case the mask is:
>
> # loop_mask_41 = PHI <next_mask_63(6), { 0, -1, -1, -1 }(2)>
>
> due to peeling for alignment with masking and compiling with
> -msve-vector-bits=128.
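>
> To make this concrete, here is a rough lane-by-lane sketch in plain C
> (illustration only, not part of the patch; the lane values are taken from
> the PHI above): even when every active lane of the comparison is true, the
> inactive lane introduced by peeling keeps the AND result from ever equaling
> the all-ones constant, so the forall test compares against the wrong value:
>
> /* Illustration only: 4 lanes, -1 = true, 0 = false.  */
> #include <stdio.h>
>
> int
> main (void)
> {
>   int loop_mask[4] = { 0, -1, -1, -1 };  /* Partial mask from peeling.  */
>   int mask_patt[4] = { -1, -1, -1, -1 }; /* x[i] > 0 held in every lane.  */
>   int all_ones = 1;
>   for (int i = 0; i < 4; i++)
>     if ((mask_patt[i] & loop_mask[i]) != -1)
>       all_ones = 0;
>   /* Prints 0: the forall test fails even though every active lane passed,
>      because lane 0 is merely inactive, not a real mismatch.  */
>   printf ("%d\n", all_ones);
>   return 0;
> }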
>
> At codegen time we generate:
>
> ptrue p15.s, vl4
> ptrue p7.b, vl1
> not p7.b, p15/z, p7.b
> .L5:
> ld1w z29.s, p7/z, [x1, x0, lsl 2]
> cmpgt p7.s, p7/z, z29.s, #0
> not p7.b, p15/z, p7.b
> ptest p15, p7.b
> b.none .L2
> ...<early exit>...
>
> Here the basic blocks are rotated and a not is generated.  But the generated
> not is unmasked (or, in this case, predicated over an all-true mask).  This
> has the unintended side effect of flipping the results of the inactive lanes
> (which were zeroed by the cmpgt) to -1, which then incorrectly causes us not
> to take the branch to .L2.
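>
> Walking the lanes by hand (again just an illustrative sketch in C, using
> 1 = predicate bit set, 0 = clear): the cmpgt, being zeroing over the loop
> mask, leaves the inactive lane 0 clear, but the following not is governed
> by p15 (all four lanes), so lane 0 flips to set while the active lanes flip
> to clear, and the ptest then sees a set bit:
>
> /* Illustration only: effect of the p15-governed not on the predicate.  */
> #include <stdio.h>
>
> int
> main (void)
> {
>   int p15[4] = { 1, 1, 1, 1 }; /* ptrue p15.s, vl4.  */
>   int p7[4]  = { 0, 1, 1, 1 }; /* After cmpgt: all active lanes true.  */
>   int any_set = 0;
>   for (int i = 0; i < 4; i++)
>     {
>       p7[i] = p15[i] ? !p7[i] : 0; /* not p7.b, p15/z, p7.b.  */
>       any_set |= p7[i];
>     }
>   /* Prints 1: the flipped inactive lane is what makes b.none fall through
>      even though every active lane satisfied the compare.  */
>   printf ("%d\n", any_set);
>   return 0;
> }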
>
> This is happening because we're not comparing against the right value for
> the forall case.  This patch gets rid of the forall case by rewriting
> if (all(mask)) into if (!all(mask)), which is the same as if (any(~mask)),
> by negating the masks and flipping the branches:
>
> 1. For unmasked loops we simply reduce the ~mask.
> 2. For masked loops we reduce (~mask & loop_mask), which is the same as
> doing (mask & loop_mask) ^ loop_mask (see the sketch below).
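>
> A quick brute-force check of the identity used in 2. (illustration only,
> iterating both lane values over 0 and -1):
>
> /* Illustration only: lane-wise, (~mask & loop_mask) is the same as
>    (mask & loop_mask) ^ loop_mask.  */
> #include <assert.h>
>
> int
> main (void)
> {
>   int vals[2] = { 0, -1 };
>   for (int i = 0; i < 2; i++)
>     for (int j = 0; j < 2; j++)
>       {
>         int mask = vals[i], loop_mask = vals[j];
>         assert ((~mask & loop_mask) == ((mask & loop_mask) ^ loop_mask));
>       }
>   return 0;
> }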
>
> For the above we now generate:
>
> .L5:
> ld1w z28.s, p7/z, [x1, x0, lsl 2]
> cmple p7.s, p7/z, z28.s, #0
> ptest p15, p7.b
> b.none .L2
>
> This fixes gromacs with more than one OpenMP thread and improves performance.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu (-m32, -m64)
> with no issues.
>
> Ok for master and for a backport to GCC 14?
Ok
Thanks,
Richard
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/119351
> * tree-vect-stmts.cc (vectorizable_early_exit): Mask both operands of
> the gcond for partial masking support.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/119351
> * gcc.target/aarch64/sve/pr119351.c: New test.
> * gcc.target/aarch64/sve/pr119351_run.c: New test.
>
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..85aab355f95f83e1fa65d280f14fb8ade7f7e658
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
> @@ -0,0 +1,39 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +/* { dg-final { check-function-bodies "**" "" ""} } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> +
> +int x[N] __attribute__((aligned(32)));
> +
> +/*
> +** foo:
> +** ...
> +** ld1w z[0-9]+.s, p[0-9]+/z, \[x[0-9], x[0-9], lsl 2\]
> +** cmple p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
> +** ptest p[0-9]+, p[0-9]+.b
> +** ...
> +*/
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> + int z = 0;
> + for (unsigned int i = START; i < END; ++i)
> + {
> + z++;
> + if (x[i] > 0)
> + continue;
> +
> + return z;
> + }
> + return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d36ab0eb7a900504e7dc2266ec5a19d1beeb5123
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
> @@ -0,0 +1,20 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
> +
> +#include "pr119351.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> + x[0] = 1;
> + x[1] = 21;
> + x[2] = 39;
> + x[3] = 59;
> + int res = foo ();
> + if (res != 4)
> + __builtin_abort ();
> + return 0;
> +}
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 7f874354e75e8d4d3a7196df4e9b559ac641827c..5af1973734e20b22d81e1624933d8265448c1e1a 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -13615,29 +13615,23 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
> codegen so we must replace the original insn. */
> gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info));
> gcond *cond_stmt = as_a <gcond *>(orig_stmt);
> +
> + tree cst = build_zero_cst (vectype);
> + auto bb = gimple_bb (cond_stmt);
> + edge exit_true_edge = EDGE_SUCC (bb, 0);
> + if (exit_true_edge->flags & EDGE_FALSE_VALUE)
> + exit_true_edge = EDGE_SUCC (bb, 1);
> + gcc_assert (exit_true_edge->flags & EDGE_TRUE_VALUE);
> +
> /* When vectorizing we assume that if the branch edge is taken that we're
> exiting the loop. This is not however always the case as the compiler will
> rewrite conditions to always be a comparison against 0. To do this it
> sometimes flips the edges. This is fine for scalar, but for vector we
> - then have to flip the test, as we're still assuming that if you take the
> - branch edge that we found the exit condition. i.e. we need to know whether
> - we are generating a `forall` or an `exist` condition. */
> - auto new_code = NE_EXPR;
> - auto reduc_optab = ior_optab;
> - auto reduc_op = BIT_IOR_EXPR;
> - tree cst = build_zero_cst (vectype);
> - edge exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 0);
> - if (exit_true_edge->flags & EDGE_FALSE_VALUE)
> - exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 1);
> - gcc_assert (exit_true_edge->flags & EDGE_TRUE_VALUE);
> - if (flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
> - exit_true_edge->dest))
> - {
> - new_code = EQ_EXPR;
> - reduc_optab = and_optab;
> - reduc_op = BIT_AND_EXPR;
> - cst = build_minus_one_cst (vectype);
> - }
> + then have to negate the result of the test, as we're still assuming that if
> + you take the branch edge that we found the exit condition. i.e. we need to
> + know whether we are generating a `forall` or an `exist` condition. */
> + bool flipped = flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
> + exit_true_edge->dest);
>
> /* Analyze only. */
> if (!vec_stmt)
> @@ -13653,14 +13647,13 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
> }
>
> if (ncopies > 1
> - && direct_optab_handler (reduc_optab, mode) == CODE_FOR_nothing)
> + && direct_optab_handler (ior_optab, mode) == CODE_FOR_nothing)
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "can't vectorize early exit because the "
> - "target does not support boolean vector %s "
> + "target does not support boolean vector IOR "
> "for type %T.\n",
> - reduc_optab == ior_optab ? "OR" : "AND",
> vectype);
> return false;
> }
> @@ -13720,6 +13713,29 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
> stmts.quick_push (gimple_assign_lhs (stmt));
> }
>
> + /* If we're comparing against a previous forall we need to negate the results
> + before we do the final comparison or reduction. */
> + if (flipped)
> + {
> + /* Rewrite the if(all(mask)) into if (!all(mask)) which is the same as
> + if (any(~mask)) by negating the masks and flipping the branches.
> +
> + 1. For unmasked loops we simply reduce the ~mask.
> + 2. For masked loops we reduce (~mask & loop_mask) which is the same as
> + doing (mask & loop_mask) ^ loop_mask. */
> + for (unsigned i = 0; i < stmts.length (); i++)
> + {
> + tree inv_lhs = make_temp_ssa_name (vectype, NULL, "vexit_inv");
> + auto inv_stmt = gimple_build_assign (inv_lhs, BIT_NOT_EXPR, stmts[i]);
> + vect_finish_stmt_generation (loop_vinfo, stmt_info, inv_stmt,
> + &cond_gsi);
> + stmts[i] = inv_lhs;
> + }
> +
> + EDGE_SUCC (bb, 0)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
> + EDGE_SUCC (bb, 1)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
> + }
> +
> /* Determine if we need to reduce the final value. */
> if (stmts.length () > 1)
> {
> @@ -13758,7 +13774,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
> new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc");
> tree arg0 = workset.pop ();
> tree arg1 = workset.pop ();
> - new_stmt = gimple_build_assign (new_temp, reduc_op, arg0, arg1);
> + new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1);
> vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
> &cond_gsi);
> workset.quick_insert (0, new_temp);
> @@ -13781,7 +13797,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>
> gcc_assert (new_temp);
>
> - gimple_cond_set_condition (cond_stmt, new_code, new_temp, cst);
> + gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst);
> update_stmt (orig_stmt);
>
> if (slp_node)
>
>