Hi All,

The following testcase shows incorrect masked codegen:
#define N 512
#define START 1
#define END 505

int x[N] __attribute__((aligned(32)));

int __attribute__((noipa))
foo (void)
{
  int z = 0;
  for (unsigned int i = START; i < END; ++i)
    {
      z++;
      if (x[i] > 0)
	continue;

      return z;
    }
  return -1;
}

Notice that there's a continue there instead of a break.  This means we
generate control flow where the success case stays within the loop iteration:

  mask_patt_9.12_46 = vect__1.11_45 > { 0, 0, 0, 0 };
  vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
  if (vec_mask_and_47 == { -1, -1, -1, -1 })
    goto <bb 4>; [41.48%]
  else
    goto <bb 15>; [58.52%]

However, when loop_mask_41 is a partial mask this comparison can lead to an
incorrect match.  In this case the mask is:

  # loop_mask_41 = PHI <next_mask_63(6), { 0, -1, -1, -1 }(2)>

due to peeling for alignment with masking and compiling with
-msve-vector-bits=128.

At codegen time we generate:

	ptrue	p15.s, vl4
	ptrue	p7.b, vl1
	not	p7.b, p15/z, p7.b
.L5:
	ld1w	z29.s, p7/z, [x1, x0, lsl 2]
	cmpgt	p7.s, p7/z, z29.s, #0
	not	p7.b, p15/z, p7.b
	ptest	p15, p7.b
	b.none	.L2
	...<early exit>...

Notice that at expand time the basic blocks are inverted and a not is
generated.  But the generated not is unmasked (or rather, predicated over an
all-true mask in this case).  This has the unintended side effect of flipping
the results of the inactive lanes (which were zeroed by the cmpgt) to -1,
which then incorrectly causes us not to take the branch to .L2.

This happens because the expander has no context about the mask, and since we
can't mask a gcond, we do the next best thing, which is to mask both operands.

We already mask the compare; this patch now also masks the constant.  In the
normal case this means we simply drop it, since { 0, ... } & mask = { 0, ... },
but in the case of a forall comparison the all-ones constant folds to the loop
mask itself, so the comparison becomes vec_mask_and_47 == loop_mask_41 and the
inactive lanes are correctly ignored.

For the above we now generate:

.L5:
	ld1w	z28.s, p7/z, [x1, x0, lsl 2]
	cmpgt	p14.s, p7/z, z28.s, #0
	eors	p7.b, p15/z, p7.b, p14.b
	b.none	.L2
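To make the lane behaviour concrete, here is a small standalone C model of the
predicate logic above (not part of the patch; the 0/-1 lane encoding, the
lane values and all names are invented for illustration).  It simulates the
partial mask { 0, -1, -1, -1 } and shows the unmasked not wrongly flipping the
inactive lane, while the eors form leaves it alone:

#include <stdio.h>

#define LANES 4

int
main (void)
{
  /* Partial loop mask from peeling for alignment: lane 0 is inactive.  */
  int loop_mask[LANES] = { 0, -1, -1, -1 };
  /* Loaded data; the masked load zeroes the inactive lane.  */
  int data[LANES] = { 0, 21, 39, 59 };
  int buggy_continue = 1, fixed_continue = 1;

  for (int i = 0; i < LANES; i++)
    {
      /* cmpgt p7.s, p7/z, z29.s, #0: masked compare, inactive lanes -> 0.  */
      int cmp = (loop_mask[i] && data[i] > 0) ? -1 : 0;
      /* Buggy: the not is predicated over an all-true mask, so the
	 inactive lane flips from 0 to -1 and the all-zero test below
	 (the b.none) can never succeed.  */
      int buggy = ~cmp;
      /* Fixed: eors with the loop mask, i.e. the forall compare against
	 the masked constant.  Inactive lanes stay 0.  */
      int fixed = cmp ^ loop_mask[i];
      buggy_continue &= (buggy == 0);
      fixed_continue &= (fixed == 0);
    }

  /* All active lanes are > 0, so we should stay in the loop.  */
  printf ("buggy: %s\n", buggy_continue ? "stay in loop" : "early exit");
  printf ("fixed: %s\n", fixed_continue ? "stay in loop" : "early exit");
  return 0;
}

With the fix the b.none correctly sees an all-zero predicate whenever every
active lane matched, independent of how many lanes the partial mask enables.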
This fixes gromacs when run with more than one OpenMP thread, and improves
performance.

Bootstrapped and regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf
and x86_64-pc-linux-gnu (-m32 and -m64) with no issues.

Ok for master? And a backport to GCC-14?

Thanks,
Tamar

gcc/ChangeLog:

	PR tree-optimization/119351
	* tree-vect-stmts.cc (vectorizable_early_exit): Mask both operands of
	the gcond for partial masking support.

gcc/testsuite/ChangeLog:

	PR tree-optimization/119351
	* gcc.target/aarch64/sve/pr119351.c: New test.
	* gcc.target/aarch64/sve/pr119351_run.c: New test.

---
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
new file mode 100644
index 0000000000000000000000000000000000000000..198f7edb0fc01bfc74ae231db7823e9a6f0bc119
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
@@ -0,0 +1,38 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+/* { dg-final { check-function-bodies "**" "" ""} } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+/*
+** foo:
+**	...
+**	ld1w	z[0-9]+.s, p[0-9]+/z, \[x[0-9], x[0-9], lsl 2\]
+**	cmpgt	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	eors	p[0-9]+.b, p[0-9]+/z, p[0-9]+.b, p[0-9]+.b
+**	...
+*/
+
+int __attribute__((noipa))
+foo (void)
+{
+  int z = 0;
+  for (unsigned int i = START; i < END; ++i)
+    {
+      z++;
+      if (x[i] > 0)
+	continue;
+
+      return z;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
new file mode 100644
index 0000000000000000000000000000000000000000..d36ab0eb7a900504e7dc2266ec5a19d1beeb5123
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
@@ -0,0 +1,20 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "pr119351.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  x[0] = 1;
+  x[1] = 21;
+  x[2] = 39;
+  x[3] = 59;
+  int res = foo ();
+  if (res != 4)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 7f874354e75e8d4d3a7196df4e9b559ac641827c..8f67483a943705d89e3f6d744cb2bdb5cf8b44db 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13781,6 +13781,20 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 
   gcc_assert (new_temp);
 
+  /* We have to mask both operands of the gcond because if one element of the
+     mask is inactive and was zero'd we would incorrectly compare true for the
+     inactive element.  */
+  if (masked_loop_p)
+    {
+      tree mask
+	= vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype, 0);
+      cst
+	= prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, cst, &cond_gsi);
+    }
+  else if (len_loop_p)
+    cst = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
+				  ncopies, vectype, cst, 0, 1);
+
   gimple_cond_set_condition (cond_stmt, new_code, new_temp, cst);
   update_stmt (orig_stmt);

--