https://gcc.gnu.org/g:7fcd3ed36c68d39b1d51137d5bdf0bd91b99be60
commit r16-6510-g7fcd3ed36c68d39b1d51137d5bdf0bd91b99be60
Author: Tamar Christina <[email protected]>
Date: Mon Jan 5 20:55:34 2026 +0000

vect: teach if-convert to predicate __builtin calls [PR122103]

The following testcase

void f (float *__restrict c, int *__restrict d, int n)
{
    for (int i = 0; i < n; i++)
    {
      if (d[i] > 1000)
        c[i] = __builtin_sqrtf (c[i]);
    }
}

compiled with -O3 -march=armv9-a -fno-math-errno -ftrapping-math needs to be
predicated on the conditional. It's invalid to execute the branch and use a
select to extract it later unless using -fno-trapping-math.

This change in if-conversion changes what we used to generate:

  _26 = _4 > 1000;
  _34 = _33 + _2;
  _5 = (float *) _34;
  _6 = .MASK_LOAD (_5, 32B, _26, 0.0);
  _7 = __builtin_sqrtf (_6);
  .MASK_STORE (_5, 32B, _26, _7);

into

  _26 = _4 > 1000;
  _34 = _33 + _2;
  _5 = (float *) _34;
  _6 = .MASK_LOAD (_5, 32B, _26, 0.0);
  _7 = .COND_SQRT (_26, _6, _6);
  .MASK_STORE (_5, 32B, _26, _7);

which correctly results in

.L3:
        ld1w z0.s, p7/z, [x1, x3, lsl 2]
        cmpgt p7.s, p7/z, z0.s, z31.s
        ld1w z30.s, p7/z, [x0, x3, lsl 2]
        fsqrt z30.s, p7/m, z30.s
        st1w z30.s, p7, [x0, x3, lsl 2]
        incw x3
        whilelo p7.s, w3, w2
        b.any .L3

instead of

.L3:
        ld1w z0.s, p7/z, [x1, x3, lsl 2]
        cmpgt p7.s, p7/z, z0.s, z31.s
        ld1w z30.s, p7/z, [x0, x3, lsl 2]
        fsqrt z30.s, p6/m, z30.s
        st1w z30.s, p7, [x0, x3, lsl 2]
        incw x3
        whilelo p7.s, w3, w2
        b.any .L3

gcc/ChangeLog:

        PR tree-optimization/122103
        * tree-if-conv.cc (ifcvt_can_predicate): Support gimple_call_builtin_p.
        (if_convertible_stmt_p, predicate_rhs_code,
        predicate_statements): Likewise.

gcc/testsuite/ChangeLog:

        PR tree-optimization/122103
        * gcc.target/aarch64/sve/pr122103_1.c: New test.
        * gcc.target/aarch64/sve/pr122103_2.c: New test.
        * gcc.target/aarch64/sve/pr122103_3.c: New test.

Diff: --- gcc/testsuite/gcc.target/aarch64/sve/pr122103_1.c | 93 +++++++++++++++++++++++ gcc/testsuite/gcc.target/aarch64/sve/pr122103_2.c | 27 +++++++ gcc/testsuite/gcc.target/aarch64/sve/pr122103_3.c | 27 +++++++ gcc/tree-if-conv.cc | 83 ++++++++++++++++---- 4 files changed, 217 insertions(+), 13 deletions(-) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122103_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_1.c new file mode 100644 index 000000000000..f06f8611393d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_1.c @@ -0,0 +1,93 @@ +/* { dg-do run } */ +/* { dg-require-effective-target glibc } */ +/* { dg-options "-O3 -fno-math-errno -ftrapping-math -march=armv8-a+sve" } */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <math.h> +#include <fenv.h> +#include <signal.h> + +#pragma STDC FENV_ACCESS ON + +__attribute__((noinline)) +void f(float *__restrict c, int *__restrict d, int n) +{ + for (int i = 0; i < n; i++) { + if (d[i] > 1000) + c[i] = __builtin_sqrtf(c[i]); + } +} + +static void on_fpe(int sig) +{ + (void)sig; + puts("SIGFPE: trapped FP exception (likely FE_INVALID from sqrt on a negative/sNaN lane)"); + fflush(stdout); + __builtin_abort (); +} + +int main(void) +{ + signal(SIGFPE, on_fpe); + + // Clear flags and enable trap on invalid operations. + feclearexcept(FE_ALL_EXCEPT); + feenableexcept(FE_INVALID); + + // Choose a length that is NOT a multiple of typical SVE VL (unknown at runtime), + // and includes plenty of inactive lanes. + const int n = 37; + + float *c = aligned_alloc(64, (size_t)n * sizeof(float)); + int *d = aligned_alloc(64, (size_t)n * sizeof(int)); + if (!c || !d) return 1; + + // Construct data: + // - For lanes where d<=1000, put negative values in c (sqrt would be FE_INVALID if executed). + // - For lanes where d>1000, put positive values in c (legal sqrt). 
+ for (int i = 0; i < n; i++) { + if ((i % 3) == 0) { + d[i] = 1001; // active + c[i] = 4.0f; // sqrt OK + } else { + d[i] = 0; // inactive + c[i] = -1.0f; // sqrt would be invalid if wrongly executed + } + } + + // Call f. Correct behavior: no SIGFPE, and only positions with d>1000 are modified. + f(c, d, n); + + // If traps are unavailable, at least report raised flags. + int raised = fetestexcept(FE_ALL_EXCEPT); + if (raised) { + printf("FP flags raised: 0x%x\n", raised); + } else { + puts("No FP flags raised."); + } + + // Check results. + int ok = 1; + for (int i = 0; i < n; i++) { + if (d[i] > 1000) { + if (!(c[i] == 2.0f)) { // sqrt(4) = 2 + printf("Mismatch at %d: expected 2.0, got %g\n", i, c[i]); + ok = 0; + } + } else { + if (!(c[i] == -1.0f)) { // must remain unchanged + printf("Clobber at %d: expected -1.0 unchanged, got %g\n", i, c[i]); + ok = 0; + } + } + } + + puts(ok ? "OK" : "FAIL"); + free(c); + free(d); + return ok ? 0 : 2; +} + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122103_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_2.c new file mode 100644 index 000000000000..db2c1e70079d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_2.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-math-errno -ftrapping-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* +** f: +** ... +** whilelo p([0-9]+).s, wzr, w[0-9]+ +** ... +** ld1w z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\] +** cmpgt p\1.s, p\1/z, z[0-9]+.s, z[0-9]+.s +** ld1w z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\] +** fsqrt z[0-9]+.s, p\1/m, z[0-9]+.s +** st1w z[0-9]+.s, p\1, \[x[0-9]+, x[0-9]+, lsl 2\] +** incw x[0-9]+ +** whilelo p\1.s, w[0-9]+, w[0-9]+ +** ... 
+*/ +void f (float *__restrict c, int *__restrict d, int n) +{ + for (int i = 0; i < n; i++) + { + if (d[i] > 1000) + c[i] = __builtin_sqrtf (c[i]); + } +} + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122103_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_3.c new file mode 100644 index 000000000000..7232101202b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_3.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-math-errno -fno-trapping-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* +** f: +** ... +** whilelo p([0-9]+).s, wzr, w[0-9]+ +** ... +** ld1w z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\] +** cmpgt p\1.s, p\1/z, z[0-9]+.s, z[0-9]+.s +** ld1w z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\] +** fsqrt z[0-9]+.s, p[0-9]+/m, z[0-9]+.s +** st1w z[0-9]+.s, p\1, \[x[0-9]+, x[0-9]+, lsl 2\] +** incw x[0-9]+ +** whilelo p\1.s, w[0-9]+, w[0-9]+ +** ... +*/ +void f (float *__restrict c, int *__restrict d, int n) +{ + for (int i = 0; i < n; i++) + { + if (d[i] > 1000) + c[i] = __builtin_sqrtf (c[i]); + } +} + diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc index 21ffea1544cb..53227bd64c16 100644 --- a/gcc/tree-if-conv.cc +++ b/gcc/tree-if-conv.cc @@ -1006,6 +1006,19 @@ ifcvt_can_predicate (gimple *stmt) if (gimple_assign_single_p (stmt)) return ifcvt_can_use_mask_load_store (stmt); + if (gimple_call_builtin_p (stmt)) + if (tree callee = gimple_call_fndecl (stmt)) + { + auto ifn = associated_internal_fn (callee); + auto cond_ifn = get_conditional_internal_fn (ifn); + tree type = TREE_TYPE (gimple_call_fntype (stmt)); + return (cond_ifn != IFN_LAST + && vectorized_internal_fn_supported_p (cond_ifn, type)); + } + + if (!is_gimple_assign (stmt)) + return false; + tree_code code = gimple_assign_rhs_code (stmt); tree lhs_type = TREE_TYPE (gimple_assign_lhs (stmt)); tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); @@ -1150,6 +1163,23 @@ if_convertible_stmt_p (gimple *stmt, 
vec<data_reference_p> refs) } } + /* Check if the call can trap and if so require predication. */ + if (gimple_could_trap_p (stmt)) + { + if (ifcvt_can_predicate (stmt)) + { + gimple_set_plf (stmt, GF_PLF_2, true); + need_to_predicate = true; + return true; + } + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "stmt could trap...\n"); + return false; + } + } + /* There are some IFN_s that are used to replace builtins but have the same semantics. Even if MASK_CALL cannot handle them vectorable_call will insert the proper selection, so do not block conversion. */ @@ -2840,20 +2870,38 @@ value_available_p (gimple *stmt, hash_set<tree_ssa_name_hash> *ssa_names, SSA names defined earlier in STMT's block. */ static gimple * -predicate_rhs_code (gassign *stmt, tree mask, tree cond, +predicate_rhs_code (gimple *stmt, tree mask, tree cond, hash_set<tree_ssa_name_hash> *ssa_names) { - tree lhs = gimple_assign_lhs (stmt); - tree_code code = gimple_assign_rhs_code (stmt); - unsigned int nops = gimple_num_ops (stmt); - internal_fn cond_fn = get_conditional_internal_fn (code); + internal_fn cond_fn; + if (is_gimple_assign (stmt)) + { + tree_code code = gimple_assign_rhs_code (stmt); + cond_fn = get_conditional_internal_fn (code); + } + else if (tree callee = gimple_call_fndecl (stmt)) + { + auto ifn = associated_internal_fn (callee); + cond_fn = get_conditional_internal_fn (ifn); + } + else + return NULL; + + if (cond_fn == IFN_LAST) + { + gcc_assert (!gimple_could_trap_p (stmt)); + return NULL; + } + + tree lhs = gimple_get_lhs (stmt); + unsigned int nops = gimple_num_args (stmt) + 1; /* Construct the arguments to the conditional internal function. 
*/ auto_vec<tree, 8> args; args.safe_grow (nops + 1, true); args[0] = mask; - for (unsigned int i = 1; i < nops; ++i) - args[i] = gimple_op (stmt, i); + for (unsigned int i = 0; i < nops - 1; ++i) + args[i+1] = gimple_arg (stmt, i); args[nops] = NULL_TREE; /* Look for uses of the result to see whether they are COND_EXPRs that can @@ -3030,8 +3078,9 @@ predicate_statements (loop_p loop) for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);) { - gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi)); - if (!stmt) + gimple *stmt = gsi_stmt (gsi); + if (!is_gimple_assign (stmt) + && !gimple_call_builtin_p (stmt)) ; else if (is_false_predicate (cond) && gimple_vdef (stmt)) @@ -3042,9 +3091,14 @@ predicate_statements (loop_p loop) continue; } else if (gimple_plf (stmt, GF_PLF_2) - && is_gimple_assign (stmt)) + && (is_gimple_assign (stmt) + || gimple_call_builtin_p (stmt))) { - tree lhs = gimple_assign_lhs (stmt); + tree lhs = gimple_get_lhs (stmt); + /* ?? Assume that calls without an LHS are not data processing + and so no issues with traps. */ + if (!lhs) + continue; tree mask; gimple *new_stmt; gimple_seq stmts = NULL; @@ -3080,11 +3134,14 @@ predicate_statements (loop_p loop) vect_masks.safe_push (mask); } if (gimple_assign_single_p (stmt)) - new_stmt = predicate_load_or_store (&gsi, stmt, mask); + new_stmt = predicate_load_or_store (&gsi, + as_a <gassign *> (stmt), + mask); else new_stmt = predicate_rhs_code (stmt, mask, cond, &ssa_names); - gsi_replace (&gsi, new_stmt, true); + if (new_stmt) + gsi_replace (&gsi, new_stmt, true); } else if (gimple_needing_rewrite_undefined (stmt)) rewrite_to_defined_unconditional (&gsi);
