On Wed, 16 Apr 2025, Tamar Christina wrote:

> Hi All,
>
> The following testcase shows an incorrect masked codegen:
>
> #define N 512
> #define START 1
> #define END 505
>
> int x[N] __attribute__((aligned(32)));
>
> int __attribute__((noipa))
> foo (void)
> {
>   int z = 0;
>   for (unsigned int i = START; i < END; ++i)
>     {
>       z++;
>       if (x[i] > 0)
>         continue;
>
>       return z;
>     }
>   return -1;
> }
>
> notice how there's a continue there instead of a break.  This means we
> generate a control flow where success stays within the loop iteration:
I think that's just a fancy way of rotating the loop.

So the key issue is that when the exit test is "inverted", i.e. it stays
in the loop when true and exits when false, we do

      if (flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
				 exit_true_edge->dest))
	{
	  new_code = EQ_EXPR;
	  reduc_optab = and_optab;
	  reduc_op = BIT_AND_EXPR;
	  cst = build_minus_one_cst (vectype);

and with PFA with mask and the initial loop mask of { 0, 0, -1, -1 } we
then exit early, and the scalar loop does not correctly handle this case
(apart from it being a missed optimization).

For the regular non-inverted case we use

  auto new_code = NE_EXPR;
  auto reduc_optab = ior_optab;
  auto reduc_op = BIT_IOR_EXPR;
  tree cst = build_zero_cst (vectype);

and that is fine.

>   mask_patt_9.12_46 = vect__1.11_45 > { 0, 0, 0, 0 };
>   vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
>   if (vec_mask_and_47 == { -1, -1, -1, -1 })
>     goto <bb 4>; [41.48%]
>   else
>     goto <bb 15>; [58.52%]
>
> However when loop_mask_41 is a partial mask this comparison can lead to
> an incorrect match.  In this case the mask is:
>
>   # loop_mask_41 = PHI <next_mask_63(6), { 0, -1, -1, -1 }(2)>
>
> due to peeling for alignment with masking and compiling with
> -msve-vector-bits=128.
>
> At codegen time we generate:
>
>         ptrue   p15.s, vl4
>         ptrue   p7.b, vl1
>         not     p7.b, p15/z, p7.b
> .L5:
>         ld1w    z29.s, p7/z, [x1, x0, lsl 2]
>         cmpgt   p7.s, p7/z, z29.s, #0
>         not     p7.b, p15/z, p7.b
>         ptest   p15, p7.b
>         b.none  .L2
>         ...<early exit>...
>
> notice how at expand time the basic blocks are inverted and a not is
> generated.  But the generated not is unmasked (or predicated over an ALL
> true mask in this case).  This has the unintended side effect of flipping
> the results of the inactive lanes (which were zero'd by the cmpgt) into
> -1, which then incorrectly causes us to not take the branch to .L2.
>
> This is happening because the expander has no context about the mask, and
> since we can't mask a gcond, we do the next best thing which is to mask
> both operands.

So you make this sound as if it were a bug in the expander because
"it doesn't know"?  I think a compare against {-1,...} is flawed; this
case needs to compare against loop_mask, not all-ones, no?  So instead of

>   vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
>   if (vec_mask_and_47 == { -1, -1, -1, -1 })

do

>   vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
>   if (vec_mask_and_47 == loop_mask_41)

which is sort-of what you do, of course, just in an odd way (IMO).

Richard.

> We already mask the compare, but this patch now also masks the constant.
> In the normal case this means we drop it, since {0, ..} & mask = {0, ..},
> but in the case of a forall comparison we'll keep the mask, allowing the
> generated code to correctly mask the results.
>
> For the above we now generate:
>
> .L5:
>         ld1w    z28.s, p7/z, [x1, x0, lsl 2]
>         cmpgt   p14.s, p7/z, z28.s, #0
>         eors    p7.b, p15/z, p7.b, p14.b
>         b.none  .L2
>
> This fixes gromacs with > 1 OpenMP threads and improves performance.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu -m32, -m64 with no
> issues.
>
> Ok for master? and backport to GCC-14?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR tree-optimization/119351
> 	* tree-vect-stmts.cc (vectorizable_early_exit): Mask both operands
> 	of the gcond for partial masking support.
>
> gcc/testsuite/ChangeLog:
>
> 	PR tree-optimization/119351
> 	* gcc.target/aarch64/sve/pr119351.c: New test.
> 	* gcc.target/aarch64/sve/pr119351_run.c: New test.
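To spell out the lane semantics of the partial-mask case above, here is a
small standalone C model (illustrative only, not vectorizer code; the
4-lane int representation and the helper name are made up for the
example).  It shows why comparing the masked compare result against
all-ones takes the early exit spuriously, while comparing against the
masked constant (equivalently, against loop_mask) does not:

/* Illustrative model only: a 4-lane predicate is an int[4] with -1 for
   an active/true lane and 0 for an inactive/false lane.  */
#include <stdio.h>

#define LANES 4

/* Lane-wise check the gcond performs: (cmp & loop_mask) == cst.  */
static int
stay_in_loop_p (const int *cmp, const int *loop_mask, const int *cst)
{
  for (int i = 0; i < LANES; i++)
    if ((cmp[i] & loop_mask[i]) != cst[i])
      return 0;
  return 1;
}

int
main (void)
{
  /* Peeling for alignment left lane 0 inactive.  */
  int loop_mask[LANES] = { 0, -1, -1, -1 };
  /* The predicated compare zeros the inactive lane even though the
     scalar element would have compared true.  */
  int cmp[LANES] = { 0, -1, -1, -1 };

  int all_ones[LANES] = { -1, -1, -1, -1 };
  int masked_cst[LANES];
  for (int i = 0; i < LANES; i++)
    masked_cst[i] = all_ones[i] & loop_mask[i];   /* == loop_mask  */

  /* Wrong: lane 0 gives 0 != -1, so we wrongly take the early exit.  */
  printf ("vs all-ones : %d\n", stay_in_loop_p (cmp, loop_mask, all_ones));
  /* Right: lane 0 gives 0 == 0; only the active lanes decide.  */
  printf ("vs loop mask: %d\n", stay_in_loop_p (cmp, loop_mask, masked_cst));
  return 0;
}

The first check fails purely because of the inactive lane, which is the
spurious early exit seen above; masking the constant makes inactive lanes
compare 0 == 0 so only the active lanes matter.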
>
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..198f7edb0fc01bfc74ae231db7823e9a6f0bc119
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
> @@ -0,0 +1,38 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +/* { dg-final { check-function-bodies "**" "" ""} } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> +
> +int x[N] __attribute__((aligned(32)));
> +
> +/*
> +** foo:
> +**	...
> +**	ld1w	z[0-9]+.s, p[0-9]+/z, \[x[0-9], x[0-9], lsl 2\]
> +**	cmpgt	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
> +**	eors	p[0-9]+.b, p[0-9]+/z, p[0-9]+.b, p[0-9]+.b
> +**	...
> +*/
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> +  int z = 0;
> +  for (unsigned int i = START; i < END; ++i)
> +    {
> +      z++;
> +      if (x[i] > 0)
> +	continue;
> +
> +      return z;
> +    }
> +  return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d36ab0eb7a900504e7dc2266ec5a19d1beeb5123
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
> @@ -0,0 +1,20 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
> +
> +#include "pr119351.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> +  x[0] = 1;
> +  x[1] = 21;
> +  x[2] = 39;
> +  x[3] = 59;
> +  int res = foo ();
> +  if (res != 4)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 7f874354e75e8d4d3a7196df4e9b559ac641827c..8f67483a943705d89e3f6d744cb2bdb5cf8b44db 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -13781,6 +13781,20 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>    gcc_assert (new_temp);
>  
> +  /* We have to mask both operands of the gcond because if one element of the
> +     mask is inactive and was zero'd we would incorrectly compare true for the
> +     inactive element.  */
> +  if (masked_loop_p)
> +    {
> +      tree mask
> +	= vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype, 0);
> +      cst
> +	= prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, cst, &cond_gsi);
> +    }
> +  else if (len_loop_p)
> +    cst = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
> +				  ncopies, vectype, cst, 0, 1);
> +
>    gimple_cond_set_condition (cond_stmt, new_code, new_temp, cst);
>    update_stmt (orig_stmt);
>

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich;
(HRB 36809, AG Nuernberg)