[gcc r16-6511] vect: teach vectorizable_call to predicate calls when they can trap [PR122103]

Tamar Christina via Gcc-cvs Mon, 05 Jan 2026 12:59:33 -0800

https://gcc.gnu.org/g:974c04dc2cb7f44705a9fd62b3b9592d7c6faca3


commit r16-6511-g974c04dc2cb7f44705a9fd62b3b9592d7c6faca3
Author: Tamar Christina <[email protected]>
Date:   Mon Jan 5 20:56:03 2026 +0000

    vect: teach vectorizable_call to predicate calls when they can trap [PR122103]
    
    The following example
    
    void f (float *__restrict c, int *__restrict d, int n)
    {
        for (int i = 0; i < n; i++)
        {
          c[i] = __builtin_sqrtf (c[i]);
        }
    }
    
    compiled with -O3 -march=armv9-a -fno-math-errno -ftrapping-math needs to be
    predicated on the conditional (here the loop mask).  It's invalid to execute
    the operation on all lanes and use a select to extract the active lanes later
    unless using -fno-trapping-math.
    
    We currently generate:
    
    f:
            cmp     w2, 0
            ble     .L1
            mov     x1, 0
            whilelo p7.s, wzr, w2
            ptrue   p6.b, all
    .L3:
            ld1w    z31.s, p7/z, [x0, x1, lsl 2]
            fsqrt   z31.s, p6/m, z31.s
            st1w    z31.s, p7, [x0, x1, lsl 2]
            incw    x1
            whilelo p7.s, w1, w2
            b.any   .L3
    .L1:
            ret
    
    which means the inactive lanes of the operation can raise a floating-point
    exception (FE).  With this change we now generate:
    
    f:
            cmp     w2, 0
            ble     .L1
            mov     x1, 0
            whilelo p7.s, wzr, w2
            .p2align 5,,15
    .L3:
            ld1w    z31.s, p7/z, [x0, x1, lsl 2]
            fsqrt   z31.s, p7/m, z31.s
            st1w    z31.s, p7, [x0, x1, lsl 2]
            incw    x1
            whilelo p7.s, w1, w2
            b.any   .L3
    .L1:
            ret
    
    However, as discussed in PR96373, while we probably shouldn't vectorize the
    cases where we can trap but don't support a conditional operation, there
    doesn't seem to be a clear consensus on how GCC should handle trapping math.
    
    As such, similar to PR96373, I don't stop vectorization if trapping math is
    enabled and the conditional operation isn't supported.
    
    gcc/ChangeLog:
    
            PR tree-optimization/122103
            * tree-vect-stmts.cc (vectorizable_call): Handle trapping math.
    
    gcc/testsuite/ChangeLog:
    
            PR tree-optimization/122103
            * gcc.target/aarch64/sve/pr122103_4.c: New test.
            * gcc.target/aarch64/sve/pr122103_5.c: New test.
            * gcc.target/aarch64/sve/pr122103_6.c: New test.

Diff:
---
 gcc/testsuite/gcc.target/aarch64/sve/pr122103_4.c | 78 +++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/sve/pr122103_5.c | 24 +++++++
 gcc/testsuite/gcc.target/aarch64/sve/pr122103_6.c | 23 +++++++
 gcc/tree-vect-stmts.cc                            | 52 ++++++++++++---
 4 files changed, 169 insertions(+), 8 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122103_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_4.c
new file mode 100644
index 000000000000..cac485b5dade
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_4.c
@@ -0,0 +1,78 @@
+/* { dg-do run }  */
+/* { dg-require-effective-target glibc }  */
+/* { dg-options "-O3 -fno-math-errno -ftrapping-math -march=armv8-a+sve" }  */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <math.h>
+#include <fenv.h>
+#include <signal.h>
+
+#pragma STDC FENV_ACCESS ON
+
+__attribute__((noinline))
+void f(float *__restrict c, int n)
+{
+  for (int i = 0; i < n; i++)
+    c[i] = __builtin_sqrtf (c[i] - 2.0f);
+}
+
+static void on_fpe (int sig)
+{
+  (void) sig;
+  puts ("SIGFPE: trapped FP exception (unexpected invalid from sqrt)");
+  fflush (stdout);
+  __builtin_abort ();
+}
+
+int
+main (void)
+{
+  signal (SIGFPE, on_fpe);
+
+  /* Clear flags and enable trap on invalid operations.  */
+  feclearexcept (FE_ALL_EXCEPT);
+  feenableexcept (FE_INVALID);
+
+  /* Choose a length that is NOT a multiple of typical SVE VL (unknown at
+     runtime), and includes plenty of extra lanes.  */
+  const int n = 37;
+
+  float *c = aligned_alloc (64, (size_t) n * sizeof (float));
+  if (!c)
+    return 1;
+
+  /* Populate c so that (c[i] - 2) is a perfect square; this avoids FE_INVALID
+     while giving deterministic results.  */
+  for (int i = 0; i < n; i++)
+    {
+      int t = i & 3;
+      c[i] = (float) (t * t) + 2.0f;
+    }
+
+  f (c, n);
+
+  /* Only FE_INVALID would indicate a wrong extra-lane computation here.  */
+  if (fetestexcept (FE_INVALID))
+    {
+      puts ("Unexpected FE_INVALID");
+      return 2;
+    }
+
+  int ok = 1;
+  for (int i = 0; i < n; i++)
+    {
+      float expected = (float) (i & 3);
+      if (!(c[i] == expected))
+       {
+         printf ("Mismatch at %d: expected %g, got %g\n", i, expected, c[i]);
+         ok = 0;
+       }
+    }
+
+  puts (ok ? "OK" : "FAIL");
+  free (c);
+  return ok ? 0 : 3;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122103_5.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_5.c
new file mode 100644
index 000000000000..ca1bfb737b8b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_5.c
@@ -0,0 +1,24 @@
+/* { dg-do compile }  */
+/* { dg-options "-O3 -fno-math-errno -ftrapping-math -march=armv9-a" }  */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f:
+**     ...
+**     whilelo p([0-9]+).s, wzr, w[0-9]+
+**     ...
+**     ld1w    z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\]
+**     fadd    z[0-9]+.s, p\1/m, z[0-9]+.s, z[0-9]+.s
+**     fsqrt   z[0-9]+.s, p\1/m, z[0-9]+.s
+**     st1w    z[0-9]+.s, p\1, \[x[0-9]+, x[0-9]+, lsl 2\]
+**     incw    x[0-9]+
+**     whilelo p\1.s, w[0-9]+, w[0-9]+
+**     ...
+*/
+void
+f (float *__restrict c, int n)
+{
+  for (int i = 0; i < n; i++)
+    c[i] = __builtin_sqrtf (c[i] - 2.0f);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122103_6.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_6.c
new file mode 100644
index 000000000000..9c51121c7bf0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122103_6.c
@@ -0,0 +1,23 @@
+/* { dg-do compile }  */
+/* { dg-options "-O3 -fno-math-errno -fno-trapping-math -march=armv9-a" }  */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f:
+**     ...
+**     whilelo p([0-9]+).s, wzr, w[0-9]+
+**     ...
+**     ld1w    z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\]
+**     fadd    z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
+**     fsqrt   z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
+**     st1w    z[0-9]+.s, p\1, \[x[0-9]+, x[0-9]+, lsl 2\]
+**     incw    x[0-9]+
+**     whilelo p\1.s, w[0-9]+, w[0-9]+
+**     ...
+*/
+void
+f (float *__restrict c, int n)
+{
+  for (int i = 0; i < n; i++)
+    c[i] = __builtin_sqrtf (c[i] - 2.0f);
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ed2486451ca0..8086d4c35c9a 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3438,9 +3438,9 @@ vectorizable_call (vec_info *vinfo,
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
   tree fndecl, new_temp, rhs_type;
-  enum vect_def_type dt[4]
+  enum vect_def_type dt[5]
     = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
-       vect_unknown_def_type };
+       vect_unknown_def_type, vect_unknown_def_type };
   tree vectypes[ARRAY_SIZE (dt)] = {};
   slp_tree slp_op[ARRAY_SIZE (dt)] = {};
   auto_vec<tree, 8> vargs;
@@ -3481,8 +3481,8 @@ vectorizable_call (vec_info *vinfo,
 
   /* Bail out if the function has more than four arguments, we do not have
      interesting builtin functions to vectorize with more than two arguments
-     except for fma.  No arguments is also not good.  */
-  if (nargs == 0 || nargs > 4)
+     except for fma (cond_fma has more).  No arguments is also not good.  */
+  if (nargs == 0 || nargs > 5)
     return false;
 
   /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic.  */
@@ -3625,6 +3625,33 @@ vectorizable_call (vec_info *vinfo,
     ifn = vectorizable_internal_function (cfn, callee, vectype_out,
                                          vectype_in);
 
+  /* Check if the operation traps.  */
+  bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info));
+  if (could_trap && cost_vec && loop_vinfo)
+    {
+      /* If the operation can trap it must be conditional, otherwise fail.  */
+      internal_fn cond_fn = get_conditional_internal_fn (ifn);
+      internal_fn cond_len_fn = get_len_internal_fn (ifn);
+      if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+       {
+         /* We assume that BB SLP fills all lanes, so no inactive lanes can
+            cause issues.  */
+         if ((cond_fn == IFN_LAST
+              || !direct_internal_fn_supported_p (cond_fn, vectype_out,
+                                                  OPTIMIZE_FOR_SPEED))
+             && (cond_len_fn == IFN_LAST
+                 || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
+                                                     OPTIMIZE_FOR_SPEED)))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "can't use a fully-masked loop because no"
+                                " conditional operation is available.\n");
+             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+           }
+       }
+    }
+
   /* If that fails, try asking for a target-specific built-in function.  */
   if (ifn == IFN_LAST)
     {
@@ -3749,7 +3776,7 @@ vectorizable_call (vec_info *vinfo,
       else if (reduc_idx >= 0)
        gcc_unreachable ();
     }
-  else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+  else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap))
     {
       ifn = cond_fn;
       vect_nargs += 2;
@@ -3793,7 +3820,8 @@ vectorizable_call (vec_info *vinfo,
            {
              int varg = 0;
              /* Add the mask if necessary.  */
-             if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+             if (masked_loop_p && mask_opno == -1
+                 && (reduc_idx >= 0 || could_trap))
                {
                  gcc_assert (internal_fn_mask_index (ifn) == varg);
                  unsigned int vec_num = vec_oprnds0.length ();
@@ -3807,10 +3835,18 @@ vectorizable_call (vec_info *vinfo,
                  vargs[varg++] = vec_oprndsk[i];
                }
              /* Add the else value if necessary.  */
-             if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+             if (masked_loop_p && mask_opno == -1
+                && (reduc_idx >= 0 || could_trap))
                {
                  gcc_assert (internal_fn_else_index (ifn) == varg);
-                 vargs[varg++] = vargs[reduc_idx + 1];
+                 if (reduc_idx >= 0)
+                   vargs[varg++] = vargs[reduc_idx + 1];
+                 else
+                   {
+                     auto else_value = targetm.preferred_else_value
+                       (cond_fn, vectype_out, varg - 1, &vargs[1]);
+                     vargs[varg++] = else_value;
+                   }
                }
              if (clz_ctz_arg1)
                vargs[varg++] = clz_ctz_arg1;

[gcc r16-6511] vect: teach vectorizable_call to predicate calls when they can trap [PR122103]

Reply via email to