On Wed, 16 Apr 2025, Tamar Christina wrote:

> Hi All,
> 
> The following testcase shows incorrect masked codegen:
> 
> #define N 512
> #define START 1
> #define END 505
>  
> int x[N] __attribute__((aligned(32)));
> 
> int __attribute__((noipa))
> foo (void)
> {
>   int z = 0;
>   for (unsigned int i = START; i < END; ++i)
>     {
>       z++;
>       if (x[i] > 0)
>         continue;
>     
>       return z;
>     }
>   return -1;
> }
> 
> Notice how there's a continue there instead of a break.  This means we
> generate a control flow where success stays within the loop iteration:

I think that's just a fancy way of rotating the loop.

So the key issue is that when the exit test is "inverted", i.e. it
stays in the loop when true and exits when false, we do

  if (flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
                             exit_true_edge->dest))
    {
      new_code = EQ_EXPR;
      reduc_optab = and_optab;
      reduc_op = BIT_AND_EXPR;
      cst = build_minus_one_cst (vectype);

and with PFA (peeling for alignment) using masking and the initial loop
mask of { 0, 0, -1, -1 } we then exit early, and the scalar loop does not
correctly handle this case (apart from it being a missed optimization).  For the
regular non-inverted case we use

  auto new_code = NE_EXPR;
  auto reduc_optab = ior_optab;
  auto reduc_op = BIT_IOR_EXPR;
  tree cst = build_zero_cst (vectype);

and that is fine.
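
To make the failure mode concrete, here is a minimal lane-wise sketch in
plain C (lane values are invented; -1 marks a true/active lane, 0 a
false/inactive one) of why the EQ-against-all-ones form misfires once the
loop mask is partial:

  #include <stdio.h>

  int
  main (void)
  {
    /* Result of the vectorized x[i] > 0 compare: true in every lane.  */
    int cmp[4]       = { -1, -1, -1, -1 };
    /* Partial loop mask from peeling for alignment: lane 0 is inactive.  */
    int loop_mask[4] = {  0, -1, -1, -1 };

    /* Inverted exit test as emitted today:
       vec_mask_and == { -1, -1, -1, -1 }.  */
    int all_ones_check = 1;
    for (int i = 0; i < 4; i++)
      if ((cmp[i] & loop_mask[i]) != -1)
        all_ones_check = 0;

    /* Prints 0: the masked compare is 0 in the inactive lane, so the
       all-ones test fails and we leave the vector loop early even though
       every active lane wanted to continue.  */
    printf ("%d\n", all_ones_check);
    return 0;
  }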

> 
>   mask_patt_9.12_46 = vect__1.11_45 > { 0, 0, 0, 0 };
>   vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
>   if (vec_mask_and_47 == { -1, -1, -1, -1 })
>     goto <bb 4>; [41.48%]
>   else
>     goto <bb 15>; [58.52%]
> 
> However, when loop_mask_41 is a partial mask, this comparison can lead to an
> incorrect match.  In this case the mask is:
> 
>   # loop_mask_41 = PHI <next_mask_63(6), { 0, -1, -1, -1 }(2)>
> 
> due to peeling for alignment with masking and compiling with
> -msve-vector-bits=128.
> 
> At codegen time we generate:
> 
>       ptrue   p15.s, vl4
>       ptrue   p7.b, vl1
>       not     p7.b, p15/z, p7.b
> .L5:
>       ld1w    z29.s, p7/z, [x1, x0, lsl 2]
>       cmpgt   p7.s, p7/z, z29.s, #0
>       not     p7.b, p15/z, p7.b
>       ptest   p15, p7.b
>       b.none  .L2
>       ...<early exit>...
> 
> Notice how at expand time the basic blocks are inverted and a not is
> generated.  But the generated not is unmasked (or predicated over an ALL true
> mask in this case).  This has the unintended side-effect of flipping the
> results of the inactive lanes (which were zero'd by the cmpgt) into -1, which
> then incorrectly causes us to not take the branch to .L2.
> 
> This is happening because the expander has no context about the mask, and
> since we can't mask a gcond, we do the next best thing, which is to mask both
> operands.

So you make this sound as if it were a bug in the expander because
"it doesn't know"?  I think a compare against {-1,...} is flawed;
this case needs to compare against loop_mask, not all-ones, no?

So instead of

>   vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
>   if (vec_mask_and_47 == { -1, -1, -1, -1 })

do

>   vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
>   if (vec_mask_and_47 == loop_mask_41)

which is sort-of what you do, of course, just in an odd way (IMO).
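
Lane-wise that check looks like the following (a minimal plain C sketch;
the function name and lane values are my own invention, not anything in
the patch):

  /* Proposed check: vec_mask_and == loop_mask.  Inactive lanes (mask 0)
     contribute 0 on both sides and can never make the test fail; an
     active lane that is false still does.  */
  static int
  stay_in_vector_loop (const int *cmp, const int *loop_mask, int lanes)
  {
    for (int i = 0; i < lanes; i++)
      if ((cmp[i] & loop_mask[i]) != loop_mask[i])
        return 0;
    return 1;
  }

With cmp = { -1, -1, -1, -1 } and loop_mask = { 0, -1, -1, -1 } this
returns 1 and we stay in the vector loop; flip any active lane of cmp to
0 and it returns 0, giving the early exit as intended.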

Richard.

> We already mask the compare, but this patch now also masks the constant.  In
> the normal case this means we drop it, since {0, ..} & mask = {0, ..}, but in
> the case of a forall comparison we'll keep the mask, allowing the generated
> code to correctly mask the results.
> 
> For the above we now generate:
> 
> .L5:
>         ld1w    z28.s, p7/z, [x1, x0, lsl 2]
>         cmpgt   p14.s, p7/z, z28.s, #0
>         eors    p7.b, p15/z, p7.b, p14.b
>         b.none  .L2
> 
> This fixes gromacs when run with more than one OpenMP thread, and improves
> performance.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu
> (-m32 and -m64) with no issues.
> 
> Ok for master, and for a backport to GCC 14?
> 
> Thanks,
> Tamar
> 
> 
> gcc/ChangeLog:
> 
>       PR tree-optimization/119351
>       * tree-vect-stmts.cc (vectorizable_early_exit): Mask both operands of
>       the gcond for partial masking support.
> 
> gcc/testsuite/ChangeLog:
> 
>       PR tree-optimization/119351
>       * gcc.target/aarch64/sve/pr119351.c: New test.
>       * gcc.target/aarch64/sve/pr119351_run.c: New test.
> 
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..198f7edb0fc01bfc74ae231db7823e9a6f0bc119
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
> @@ -0,0 +1,38 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +/* { dg-final { check-function-bodies "**" "" ""} } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> + 
> +int x[N] __attribute__((aligned(32)));
> +
> +/*
> +** foo:
> +**   ...
> +**   ld1w    z[0-9]+.s, p[0-9]+/z, \[x[0-9], x[0-9], lsl 2\]
> +**   cmpgt   p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
> +**   eors    p[0-9]+.b, p[0-9]+/z, p[0-9]+.b, p[0-9]+.b
> +**   ...
> +*/
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> +  int z = 0;
> +  for (unsigned int i = START; i < END; ++i)
> +    {
> +      z++;
> +      if (x[i] > 0)
> +        continue;
> +    
> +      return z;
> +    }
> +  return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" 
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d36ab0eb7a900504e7dc2266ec5a19d1beeb5123
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
> @@ -0,0 +1,20 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target 
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target 
> aarch64_sve128_hw } } */
> +
> +#include "pr119351.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> +  x[0] = 1;
> +  x[1] = 21;
> +  x[2] = 39;
> +  x[3] = 59;
> +  int res = foo ();
> +  if (res != 4)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 7f874354e75e8d4d3a7196df4e9b559ac641827c..8f67483a943705d89e3f6d744cb2bdb5cf8b44db 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -13781,6 +13781,20 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>    gcc_assert (new_temp);
>  
> +  /* We have to mask both operands of the gcond because if one element of the
> +     mask is inactive and was zero'd we would incorrectly compare true for the
> +     inactive element.  */
> +  if (masked_loop_p)
> +    {
> +      tree mask
> +     = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype, 0);
> +      cst
> +     = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, cst, &cond_gsi);
> +    }
> +  else if (len_loop_p)
> +    cst = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
> +                               ncopies, vectype, cst, 0, 1);
> +
>    gimple_cond_set_condition (cond_stmt, new_code, new_temp, cst);
>    update_stmt (orig_stmt);
>  
> 
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
