https://gcc.gnu.org/g:974c04dc2cb7f44705a9fd62b3b9592d7c6faca3
commit r16-6511-g974c04dc2cb7f44705a9fd62b3b9592d7c6faca3 Author: Tamar Christina <[email protected]> Date: Mon Jan 5 20:56:03 2026 +0000 vect: teach vectorizable_call to predicate calls when they can trap [PR122103] The following example void f (float *__restrict c, int *__restrict d, int n) { for (int i = 0; i < n; i++) { c[i] = __builtin_sqrtf (c[i]); } } compiled with -O3 -march=armv9-a -fno-math-errno -ftrapping-math needs the sqrt to be predicated on the loop mask. It's invalid to execute the operation on all lanes and use a select to extract the active results later unless using -fno-trapping-math. We currently generate: f: cmp w2, 0 ble .L1 mov x1, 0 whilelo p7.s, wzr, w2 ptrue p6.b, all .L3: ld1w z31.s, p7/z, [x0, x1, lsl 2] fsqrt z31.s, p6/m, z31.s st1w z31.s, p7, [x0, x1, lsl 2] incw x1 whilelo p7.s, w1, w2 b.any .L3 .L1: ret which means the inactive lanes of the operation can raise a floating-point exception (FE). With this change we now generate: f: cmp w2, 0 ble .L1 mov x1, 0 whilelo p7.s, wzr, w2 .p2align 5,,15 .L3: ld1w z31.s, p7/z, [x0, x1, lsl 2] fsqrt z31.s, p7/m, z31.s st1w z31.s, p7, [x0, x1, lsl 2] incw x1 whilelo p7.s, w1, w2 b.any .L3 .L1: ret However, as discussed in PR96373, while we probably shouldn't vectorize the cases where we can trap but don't support the conditional operation, there doesn't seem to be a clear consensus on how GCC should handle trapping math. As such, similar to PR96373, I don't stop vectorization if trapping math is enabled and the conditional operation isn't supported. gcc/ChangeLog: PR tree-optimization/122103 * tree-vect-stmts.cc (vectorizable_call): Handle trapping math. gcc/testsuite/ChangeLog: PR tree-optimization/122103 * gcc.target/aarch64/sve/pr122103_4.c: New test. * gcc.target/aarch64/sve/pr122103_5.c: New test. * gcc.target/aarch64/sve/pr122103_6.c: New test. 
Diff: --- gcc/testsuite/gcc.target/aarch64/sve/pr122103_4.c | 78 +++++++++++++++++++++++ gcc/testsuite/gcc.target/aarch64/sve/pr122103_5.c | 24 +++++++ gcc/testsuite/gcc.target/aarch64/sve/pr122103_6.c | 23 +++++++ gcc/tree-vect-stmts.cc | 52 ++++++++++++--- 4 files changed, 169 insertions(+), 8 deletions(-) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122103_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_4.c new file mode 100644 index 000000000000..cac485b5dade --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_4.c @@ -0,0 +1,78 @@ +/* { dg-do run } */ +/* { dg-require-effective-target glibc } */ +/* { dg-options "-O3 -fno-math-errno -ftrapping-math -march=armv8-a+sve" } */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <math.h> +#include <fenv.h> +#include <signal.h> + +#pragma STDC FENV_ACCESS ON + +__attribute__((noinline)) +void f(float *__restrict c, int n) +{ + for (int i = 0; i < n; i++) + c[i] = __builtin_sqrtf (c[i] - 2.0f); +} + +static void on_fpe (int sig) +{ + (void) sig; + puts ("SIGFPE: trapped FP exception (unexpected invalid from sqrt)"); + fflush (stdout); + __builtin_abort (); +} + +int +main (void) +{ + signal (SIGFPE, on_fpe); + + /* Clear flags and enable trap on invalid operations. */ + feclearexcept (FE_ALL_EXCEPT); + feenableexcept (FE_INVALID); + + /* Choose a length that is NOT a multiple of typical SVE VL (unknown at + runtime), and includes plenty of extra lanes. */ + const int n = 37; + + float *c = aligned_alloc (64, (size_t) n * sizeof (float)); + if (!c) + return 1; + + /* Populate c so that (c[i] - 2) is a perfect square; this avoids FE_INVALID + while giving deterministic results. */ + for (int i = 0; i < n; i++) + { + int t = i & 3; + c[i] = (float) (t * t) + 2.0f; + } + + f (c, n); + + /* Only FE_INVALID would indicate a wrong extra-lane computation here. 
*/ + if (fetestexcept (FE_INVALID)) + { + puts ("Unexpected FE_INVALID"); + return 2; + } + + int ok = 1; + for (int i = 0; i < n; i++) + { + float expected = (float) (i & 3); + if (!(c[i] == expected)) + { + printf ("Mismatch at %d: expected %g, got %g\n", i, expected, c[i]); + ok = 0; + } + } + + puts (ok ? "OK" : "FAIL"); + free (c); + return ok ? 0 : 3; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122103_5.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_5.c new file mode 100644 index 000000000000..ca1bfb737b8b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_5.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-math-errno -ftrapping-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* +** f: +** ... +** whilelo p([0-9]+).s, wzr, w[0-9]+ +** ... +** ld1w z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\] +** fadd z[0-9]+.s, p\1/m, z[0-9]+.s, z[0-9]+.s +** fsqrt z[0-9]+.s, p\1/m, z[0-9]+.s +** st1w z[0-9]+.s, p\1, \[x[0-9]+, x[0-9]+, lsl 2\] +** incw x[0-9]+ +** whilelo p\1.s, w[0-9]+, w[0-9]+ +** ... +*/ +void +f (float *__restrict c, int n) +{ + for (int i = 0; i < n; i++) + c[i] = __builtin_sqrtf (c[i] - 2.0f); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122103_6.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_6.c new file mode 100644 index 000000000000..9c51121c7bf0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_6.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-math-errno -fno-trapping-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* +** f: +** ... +** whilelo p([0-9]+).s, wzr, w[0-9]+ +** ... +** ld1w z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\] +** fadd z[0-9]+.s, z[0-9]+.s, z[0-9]+.s +** fsqrt z[0-9]+.s, p[0-9]+/m, z[0-9]+.s +** st1w z[0-9]+.s, p\1, \[x[0-9]+, x[0-9]+, lsl 2\] +** incw x[0-9]+ +** whilelo p\1.s, w[0-9]+, w[0-9]+ +** ... 
+*/ +void +f (float *__restrict c, int n) +{ + for (int i = 0; i < n; i++) + c[i] = __builtin_sqrtf (c[i] - 2.0f); +} diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index ed2486451ca0..8086d4c35c9a 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -3438,9 +3438,9 @@ vectorizable_call (vec_info *vinfo, loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo); tree fndecl, new_temp, rhs_type; - enum vect_def_type dt[4] + enum vect_def_type dt[5] = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type, - vect_unknown_def_type }; + vect_unknown_def_type, vect_unknown_def_type }; tree vectypes[ARRAY_SIZE (dt)] = {}; slp_tree slp_op[ARRAY_SIZE (dt)] = {}; auto_vec<tree, 8> vargs; @@ -3481,8 +3481,8 @@ vectorizable_call (vec_info *vinfo, /* Bail out if the function has more than four arguments, we do not have interesting builtin functions to vectorize with more than two arguments - except for fma. No arguments is also not good. */ - if (nargs == 0 || nargs > 4) + except for fma (cond_fma has more). No arguments is also not good. */ + if (nargs == 0 || nargs > 5) return false; /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic. */ @@ -3625,6 +3625,33 @@ vectorizable_call (vec_info *vinfo, ifn = vectorizable_internal_function (cfn, callee, vectype_out, vectype_in); + /* Check if the operation traps. */ + bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info)); + if (could_trap && cost_vec && loop_vinfo) + { + /* If the operation can trap it must be conditional, otherwise fail. */ + internal_fn cond_fn = get_conditional_internal_fn (ifn); + internal_fn cond_len_fn = get_len_internal_fn (ifn); + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) + { + /* We assume that BB SLP fills all lanes, so no inactive lanes can + cause issues. 
*/ + if ((cond_fn == IFN_LAST + || !direct_internal_fn_supported_p (cond_fn, vectype_out, + OPTIMIZE_FOR_SPEED)) + && (cond_len_fn == IFN_LAST + || !direct_internal_fn_supported_p (cond_len_fn, vectype_out, + OPTIMIZE_FOR_SPEED))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a fully-masked loop because no" + " conditional operation is available.\n"); + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } + } + } + /* If that fails, try asking for a target-specific built-in function. */ if (ifn == IFN_LAST) { @@ -3749,7 +3776,7 @@ vectorizable_call (vec_info *vinfo, else if (reduc_idx >= 0) gcc_unreachable (); } - else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0) + else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap)) { ifn = cond_fn; vect_nargs += 2; @@ -3793,7 +3820,8 @@ vectorizable_call (vec_info *vinfo, { int varg = 0; /* Add the mask if necessary. */ - if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0) + if (masked_loop_p && mask_opno == -1 + && (reduc_idx >= 0 || could_trap)) { gcc_assert (internal_fn_mask_index (ifn) == varg); unsigned int vec_num = vec_oprnds0.length (); @@ -3807,10 +3835,18 @@ vectorizable_call (vec_info *vinfo, vargs[varg++] = vec_oprndsk[i]; } /* Add the else value if necessary. */ - if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0) + if (masked_loop_p && mask_opno == -1 + && (reduc_idx >= 0 || could_trap)) { gcc_assert (internal_fn_else_index (ifn) == varg); - vargs[varg++] = vargs[reduc_idx + 1]; + if (reduc_idx >= 0) + vargs[varg++] = vargs[reduc_idx + 1]; + else + { + auto else_value = targetm.preferred_else_value + (cond_fn, vectype_out, varg - 1, &vargs[1]); + vargs[varg++] = else_value; + } } if (clz_ctz_arg1) vargs[varg++] = clz_ctz_arg1;
