> On 16 Apr 2025, at 23:37, Tamar Christina <tamar.christ...@arm.com> wrote:
> 
> Hi All,
> 
> The following testcase shows an incorrect masked codegen:
> 
> #define N 512
> #define START 1
> #define END 505
> 
> int x[N] __attribute__((aligned(32)));
> 
> int __attribute__((noipa))
> foo (void)
> {
>  int z = 0;
>  for (unsigned int i = START; i < END; ++i)
>    {
>      z++;
>      if (x[i] > 0)
>        continue;
> 
>      return z;
>    }
>  return -1;
> }
> 
> Notice how there's a continue there instead of a break.  This means we
> generate control flow where success stays within the loop iteration:
> 
>  mask_patt_9.12_46 = vect__1.11_45 > { 0, 0, 0, 0 };
>  vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
>  if (vec_mask_and_47 == { -1, -1, -1, -1 })
>    goto <bb 4>; [41.48%]
>  else
>    goto <bb 15>; [58.52%]
> 
> However, when loop_mask_41 is a partial mask, this comparison can lead to an
> incorrect match.  In this case the mask is:
> 
>  # loop_mask_41 = PHI <next_mask_63(6), { 0, -1, -1, -1 }(2)>
> 
> due to peeling for alignment with masking and compiling with
> -msve-vector-bits=128.
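> 
> To see why (lane values for illustration only): the & with loop_mask_41
> forces lane 0 of vec_mask_and_47 to 0, so even when every active element of
> x is positive we get
> 
>    vec_mask_and_47 = { 0, -1, -1, -1 }
> 
> which can never compare equal to { -1, -1, -1, -1 }.  For the forall test to
> stay correct, the comparison would have to be against the loop mask itself.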
> 
> At codegen time we generate:
> 
>    ptrue   p15.s, vl4
>    ptrue   p7.b, vl1
>    not     p7.b, p15/z, p7.b
> .L5:
>    ld1w    z29.s, p7/z, [x1, x0, lsl 2]
>    cmpgt   p7.s, p7/z, z29.s, #0
>    not     p7.b, p15/z, p7.b
>    ptest   p15, p7.b
>    b.none  .L2
>    ...<early exit>...
> 
> Here the basic blocks are rotated and a not is generated.  But the generated
> not is unmasked (or, in this case, predicated over an all-true mask).  This
> has the unintended side effect of flipping the results of the inactive lanes
> (which were zeroed by the cmpgt) to -1, which then incorrectly causes us not
> to take the branch to .L2.
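> 
> A lane-by-lane sketch of the failure (illustrative values, assuming every
> active element is positive):
> 
>    p7 (loop mask)             = { 0, 1, 1, 1 }
>    cmpgt p7.s, p7/z, z29, #0  = { 0, 1, 1, 1 }   inactive lane 0 is zeroed
>    not p7.b, p15/z, p7.b      = { 1, 0, 0, 0 }   lane 0 wrongly flipped to 1
>    ptest p15, p7.b            -> a bit is set, so b.none falls through and
>                                  we take the early exit instead of .L2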
> 
> This is happening because we're not comparing against the right value for the
> forall case.  This patch gets rid of the forall case by rewriting
> if (all(mask)) into if (!all(mask)), which is the same as if (any(~mask)), by
> negating the masks and flipping the branches:
> 
>    1. For unmasked loops we simply reduce the ~mask.
>    2. For masked loops we reduce (~mask & loop_mask), which is the same as
>       doing (mask & loop_mask) ^ loop_mask (sanity-checked below).
> 
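> As a quick sanity check of the identity in point 2 (illustrative only, not
> part of the patch), a small C program can verify it exhaustively over 4-bit
> masks:
> 
>    /* Verify (~mask & loop_mask) == ((mask & loop_mask) ^ loop_mask)
>       for all 4-bit mask values.  */
>    #include <assert.h>
> 
>    int
>    main (void)
>    {
>      for (unsigned mask = 0; mask < 16; mask++)
>        for (unsigned loop_mask = 0; loop_mask < 16; loop_mask++)
>          assert ((~mask & loop_mask) == ((mask & loop_mask) ^ loop_mask));
>      return 0;
>    }
> 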
> For the above we now generate:
> 
> .L5:
>        ld1w    z28.s, p7/z, [x1, x0, lsl 2]
>        cmple   p7.s, p7/z, z28.s, #0
>        ptest   p15, p7.b
>        b.none  .L2
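> 
> i.e. the predicated cmple computes (~mask & loop_mask) directly: the inactive
> lanes stay zero because the compare is governed by the loop mask, so b.none
> correctly branches to .L2 when every active element is positive.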
> 
> This fixes gromacs with more than one OpenMP thread and improves performance.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu
> (-m32 and -m64) with no issues.
> 
> Ok for master and for a backport to GCC-14?

Ok

Thanks,
Richard 

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>    PR tree-optimization/119351
>    * tree-vect-stmts.cc (vectorizable_early_exit): Mask both operands of
>    the gcond for partial masking support.
> 
> gcc/testsuite/ChangeLog:
> 
>    PR tree-optimization/119351
>    * gcc.target/aarch64/sve/pr119351.c: New test.
>    * gcc.target/aarch64/sve/pr119351_run.c: New test.
> 
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..85aab355f95f83e1fa65d280f14fb8ade7f7e658
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
> @@ -0,0 +1,39 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +/* { dg-final { check-function-bodies "**" "" ""} } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> +
> +int x[N] __attribute__((aligned(32)));
> +
> +/*
> +** foo:
> +**    ...
> +**    ld1w    z[0-9]+.s, p[0-9]+/z, \[x[0-9], x[0-9], lsl 2\]
> +**    cmple    p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
> +**    ptest    p[0-9]+, p[0-9]+.b
> +**    ...
> +*/
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> +  int z = 0;
> +  for (unsigned int i = START; i < END; ++i)
> +    {
> +      z++;
> +      if (x[i] > 0)
> +        continue;
> +    
> +      return z;
> +    }
> +  return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d36ab0eb7a900504e7dc2266ec5a19d1beeb5123
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
> @@ -0,0 +1,20 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
> +
> +#include "pr119351.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> +  x[0] = 1;
> +  x[1] = 21;
> +  x[2] = 39;
> +  x[3] = 59;
> +  int res = foo ();
> +  if (res != 4)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 7f874354e75e8d4d3a7196df4e9b559ac641827c..5af1973734e20b22d81e1624933d8265448c1e1a 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -13615,29 +13615,23 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>      codegen so we must replace the original insn.  */
>   gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info));
>   gcond *cond_stmt = as_a <gcond *>(orig_stmt);
> +
> +  tree cst = build_zero_cst (vectype);
> +  auto bb = gimple_bb (cond_stmt);
> +  edge exit_true_edge = EDGE_SUCC (bb, 0);
> +  if (exit_true_edge->flags & EDGE_FALSE_VALUE)
> +    exit_true_edge = EDGE_SUCC (bb, 1);
> +  gcc_assert (exit_true_edge->flags & EDGE_TRUE_VALUE);
> +
>   /* When vectorizing we assume that if the branch edge is taken that we're
>      exiting the loop.  This is not however always the case as the compiler will
>      rewrite conditions to always be a comparison against 0.  To do this it
>      sometimes flips the edges.  This is fine for scalar,  but for vector we
> -     then have to flip the test, as we're still assuming that if you take the
> -     branch edge that we found the exit condition.  i.e. we need to know whether
> -     we are generating a `forall` or an `exist` condition.  */
> -  auto new_code = NE_EXPR;
> -  auto reduc_optab = ior_optab;
> -  auto reduc_op = BIT_IOR_EXPR;
> -  tree cst = build_zero_cst (vectype);
> -  edge exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 0);
> -  if (exit_true_edge->flags & EDGE_FALSE_VALUE)
> -    exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 1);
> -  gcc_assert (exit_true_edge->flags & EDGE_TRUE_VALUE);
> -  if (flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
> -                 exit_true_edge->dest))
> -    {
> -      new_code = EQ_EXPR;
> -      reduc_optab = and_optab;
> -      reduc_op = BIT_AND_EXPR;
> -      cst = build_minus_one_cst (vectype);
> -    }
> +     then have to negate the result of the test, as we're still assuming that if
> +     you take the branch edge that we found the exit condition.  i.e. we need to
> +     know whether we are generating a `forall` or an `exist` condition.  */
> +  bool flipped = flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
> +                    exit_true_edge->dest);
> 
>   /* Analyze only.  */
>   if (!vec_stmt)
> @@ -13653,14 +13647,13 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>    }
> 
>       if (ncopies > 1
> -      && direct_optab_handler (reduc_optab, mode) == CODE_FOR_nothing)
> +      && direct_optab_handler (ior_optab, mode) == CODE_FOR_nothing)
>    {
>      if (dump_enabled_p ())
>          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                   "can't vectorize early exit because the "
> -                   "target does not support boolean vector %s "
> +                   "target does not support boolean vector IOR "
>                   "for type %T.\n",
> -                   reduc_optab == ior_optab ? "OR" : "AND",
>                   vectype);
>      return false;
>    }
> @@ -13720,6 +13713,29 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>    stmts.quick_push (gimple_assign_lhs (stmt));
>     }
> 
> +  /* If we're comparing against a previous forall we need to negate the results
> +     before we do the final comparison or reduction.  */
> +  if (flipped)
> +    {
> +      /* Rewrite the if(all(mask)) into if (!all(mask)) which is the same as
> +     if (any(~mask)) by negating the masks and flipping the branches.
> +
> +    1. For unmasked loops we simply reduce the ~mask.
> +    2. For masked loops we reduce (~mask & loop_mask) which is the same as
> +       doing (mask & loop_mask) ^ loop_mask.  */
> +      for (unsigned i = 0; i < stmts.length (); i++)
> +    {
> +      tree inv_lhs = make_temp_ssa_name (vectype, NULL, "vexit_inv");
> +      auto inv_stmt = gimple_build_assign (inv_lhs, BIT_NOT_EXPR, stmts[i]);
> +      vect_finish_stmt_generation (loop_vinfo, stmt_info, inv_stmt,
> +                       &cond_gsi);
> +      stmts[i] = inv_lhs;
> +    }
> +
> +      EDGE_SUCC (bb, 0)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
> +      EDGE_SUCC (bb, 1)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
> +    }
> +
>   /* Determine if we need to reduce the final value.  */
>   if (stmts.length () > 1)
>     {
> @@ -13758,7 +13774,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>      new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc");
>      tree arg0 = workset.pop ();
>      tree arg1 = workset.pop ();
> -      new_stmt = gimple_build_assign (new_temp, reduc_op, arg0, arg1);
> +      new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1);
>      vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
>                       &cond_gsi);
>      workset.quick_insert (0, new_temp);
> @@ -13781,7 +13797,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
> 
>   gcc_assert (new_temp);
> 
> -  gimple_cond_set_condition (cond_stmt, new_code, new_temp, cst);
> +  gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst);
>   update_stmt (orig_stmt);
> 
>   if (slp_node)
> 
> 
