On Wed, Sep 25, 2024 at 10:17:50AM +0800, Hongtao Liu wrote:
> > +           for (int i = 0; i < 2; ++i)
> > +             {
> > +               unsigned count = vector_cst_encoded_nelts (args[i]), j;
> > +               for (j = 0; j < count; ++j)
> > +                 if (!tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j)))
> Is this a typo? I assume you want to check if the component is NAN, so
> tree_expr_nan_p, not !tree_expr_nan_p?
> > +                   break;
> > +               if (j < count)
> > +                 break;
> Also, this break just breaks the outer loop (for (int i = 0; i < 2; i++)),
> but according to the comments, it wants to break out of the outer switch?

You're right, thanks for catching that.  Fortunately, both bugs together just
meant that NaN operands got optimized as well, while everything else was still
optimized as it should be.

I just wanted to avoid return NULL_TREE; or goto and screwed it up.

Here is a fixed version, additionally tested by looking at the gimple dump of
typedef float __v4sf __attribute__((vector_size (16)));
__v4sf foo (void)
{
  return __builtin_ia32_minss ((__v4sf) { __builtin_nanf (""), 0.f, 0.f, 0.f },
                               (__v4sf) { __builtin_inff (), 1.0f, 2.0f, 3.0f });
}
__v4sf bar (void)
{
  return __builtin_ia32_minss ((__v4sf) { -__builtin_inff (), 0.f, 0.f, 0.f },
                               (__v4sf) { __builtin_inff (), 1.0f, 2.0f, 3.0f });
}
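
If the folding behaves as intended, foo should keep the __builtin_ia32_minss
call because of the NaN operand (the fold punts on NaNs), while bar should
fold to a vector constant; roughly (my expectation, sketched in source form
rather than exact dump syntax):

__v4sf bar (void)
{
  /* Element 0 is min (-Inf, +Inf) and elements 1-3 are taken from the
     first operand, matching the minss semantics.  */
  return (__v4sf) { -__builtin_inff (), 0.f, 0.f, 0.f };
}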

Ok for trunk if it passes bootstrap/regtest?

2024-09-25  Jakub Jelinek  <ja...@redhat.com>

        PR target/116738
        * config/i386/i386.cc (ix86_fold_builtin): Handle
        IX86_BUILTIN_M{IN,AX}{S,P}{S,H,D}*.
        (ix86_gimple_fold_builtin): Handle IX86_BUILTIN_M{IN,AX}P{S,H,D}*.

        * gcc.target/i386/avx512f-pr116738-1.c: New test.
        * gcc.target/i386/avx512f-pr116738-2.c: New test.

--- gcc/config/i386/i386.cc.jj  2024-09-24 18:54:24.120313544 +0200
+++ gcc/config/i386/i386.cc     2024-09-25 10:21:00.922417024 +0200
@@ -18507,6 +18507,8 @@ ix86_fold_builtin (tree fndecl, int n_ar
        = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
       enum rtx_code rcode;
       bool is_vshift;
+      enum tree_code tcode;
+      bool is_scalar;
       unsigned HOST_WIDE_INT mask;
 
       switch (fn_code)
@@ -18956,6 +18958,131 @@ ix86_fold_builtin (tree fndecl, int n_ar
            }
          break;
 
+       case IX86_BUILTIN_MINSS:
+       case IX86_BUILTIN_MINSH_MASK:
+         tcode = LT_EXPR;
+         is_scalar = true;
+         goto do_minmax;
+
+       case IX86_BUILTIN_MAXSS:
+       case IX86_BUILTIN_MAXSH_MASK:
+         tcode = GT_EXPR;
+         is_scalar = true;
+         goto do_minmax;
+
+       case IX86_BUILTIN_MINPS:
+       case IX86_BUILTIN_MINPD:
+       case IX86_BUILTIN_MINPS256:
+       case IX86_BUILTIN_MINPD256:
+       case IX86_BUILTIN_MINPS512:
+       case IX86_BUILTIN_MINPD512:
+       case IX86_BUILTIN_MINPS128_MASK:
+       case IX86_BUILTIN_MINPD128_MASK:
+       case IX86_BUILTIN_MINPS256_MASK:
+       case IX86_BUILTIN_MINPD256_MASK:
+       case IX86_BUILTIN_MINPH128_MASK:
+       case IX86_BUILTIN_MINPH256_MASK:
+       case IX86_BUILTIN_MINPH512_MASK:
+         tcode = LT_EXPR;
+         is_scalar = false;
+         goto do_minmax;
+
+       case IX86_BUILTIN_MAXPS:
+       case IX86_BUILTIN_MAXPD:
+       case IX86_BUILTIN_MAXPS256:
+       case IX86_BUILTIN_MAXPD256:
+       case IX86_BUILTIN_MAXPS512:
+       case IX86_BUILTIN_MAXPD512:
+       case IX86_BUILTIN_MAXPS128_MASK:
+       case IX86_BUILTIN_MAXPD128_MASK:
+       case IX86_BUILTIN_MAXPS256_MASK:
+       case IX86_BUILTIN_MAXPD256_MASK:
+       case IX86_BUILTIN_MAXPH128_MASK:
+       case IX86_BUILTIN_MAXPH256_MASK:
+       case IX86_BUILTIN_MAXPH512_MASK:
+         tcode = GT_EXPR;
+         is_scalar = false;
+       do_minmax:
+         gcc_assert (n_args >= 2);
+         if (TREE_CODE (args[0]) != VECTOR_CST
+             || TREE_CODE (args[1]) != VECTOR_CST)
+           break;
+         mask = HOST_WIDE_INT_M1U;
+         if (n_args > 2)
+           {
+             gcc_assert (n_args >= 4);
+             /* This is masked minmax.  */
+             if (TREE_CODE (args[3]) != INTEGER_CST
+                 || TREE_SIDE_EFFECTS (args[2]))
+               break;
+             mask = TREE_INT_CST_LOW (args[3]);
+             unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+             mask |= HOST_WIDE_INT_M1U << elems;
+             if (mask != HOST_WIDE_INT_M1U
+                 && TREE_CODE (args[2]) != VECTOR_CST)
+               break;
+             if (n_args >= 5)
+               {
+                 if (!tree_fits_uhwi_p (args[4]))
+                   break;
+                 if (tree_to_uhwi (args[4]) != 4
+                     && tree_to_uhwi (args[4]) != 8)
+                   break;
+               }
+             if (mask == (HOST_WIDE_INT_M1U << elems))
+               return args[2];
+           }
+         /* Punt on NaNs, unless exceptions are disabled.  */
+         if (HONOR_NANS (args[0])
+             && (n_args < 5 || tree_to_uhwi (args[4]) != 8))
+           for (int i = 0; i < 2; ++i)
+             {
+               unsigned count = vector_cst_encoded_nelts (args[i]);
+               for (unsigned j = 0; j < count; ++j)
+                 if (tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j)))
+                   return NULL_TREE;
+             }
+         {
+           tree res = const_binop (tcode,
+                                   truth_type_for (TREE_TYPE (args[0])),
+                                   args[0], args[1]);
+           if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+             break;
+           res = fold_ternary (VEC_COND_EXPR, TREE_TYPE (args[0]), res,
+                               args[0], args[1]);
+           if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+             break;
+           if (mask != HOST_WIDE_INT_M1U)
+             {
+               unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+               vec_perm_builder sel (nelts, nelts, 1);
+               for (unsigned int i = 0; i < nelts; i++)
+                 if (mask & (HOST_WIDE_INT_1U << i))
+                   sel.quick_push (i);
+                 else
+                   sel.quick_push (nelts + i);
+               vec_perm_indices indices (sel, 2, nelts);
+               res = fold_vec_perm (TREE_TYPE (args[0]), res, args[2],
+                                    indices);
+               if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+                 break;
+             }
+           if (is_scalar)
+             {
+               unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+               vec_perm_builder sel (nelts, nelts, 1);
+               sel.quick_push (0);
+               for (unsigned int i = 1; i < nelts; i++)
+                 sel.quick_push (nelts + i);
+               vec_perm_indices indices (sel, 2, nelts);
+               res = fold_vec_perm (TREE_TYPE (args[0]), res, args[0],
+                                    indices);
+               if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+                 break;
+             }
+           return res;
+         }
+
        default:
          break;
        }
@@ -19500,6 +19627,74 @@ ix86_gimple_fold_builtin (gimple_stmt_it
        gsi_replace (gsi, g, false);
       }
       return true;
+
+    case IX86_BUILTIN_MINPS:
+    case IX86_BUILTIN_MINPD:
+    case IX86_BUILTIN_MINPS256:
+    case IX86_BUILTIN_MINPD256:
+    case IX86_BUILTIN_MINPS512:
+    case IX86_BUILTIN_MINPD512:
+    case IX86_BUILTIN_MINPS128_MASK:
+    case IX86_BUILTIN_MINPD128_MASK:
+    case IX86_BUILTIN_MINPS256_MASK:
+    case IX86_BUILTIN_MINPD256_MASK:
+    case IX86_BUILTIN_MINPH128_MASK:
+    case IX86_BUILTIN_MINPH256_MASK:
+    case IX86_BUILTIN_MINPH512_MASK:
+      tcode = LT_EXPR;
+      goto do_minmax;
+
+    case IX86_BUILTIN_MAXPS:
+    case IX86_BUILTIN_MAXPD:
+    case IX86_BUILTIN_MAXPS256:
+    case IX86_BUILTIN_MAXPD256:
+    case IX86_BUILTIN_MAXPS512:
+    case IX86_BUILTIN_MAXPD512:
+    case IX86_BUILTIN_MAXPS128_MASK:
+    case IX86_BUILTIN_MAXPD128_MASK:
+    case IX86_BUILTIN_MAXPS256_MASK:
+    case IX86_BUILTIN_MAXPD256_MASK:
+    case IX86_BUILTIN_MAXPH128_MASK:
+    case IX86_BUILTIN_MAXPH256_MASK:
+    case IX86_BUILTIN_MAXPH512_MASK:
+      tcode = GT_EXPR;
+    do_minmax:
+      gcc_assert (n_args >= 2);
+      /* Without SSE4.1 we often aren't able to pattern match it back to the
+        desired instruction.  */
+      if (!gimple_call_lhs (stmt) || !optimize || !TARGET_SSE4_1)
+       break;
+      arg0 = gimple_call_arg (stmt, 0);
+      arg1 = gimple_call_arg (stmt, 1);
+      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      /* For masked minmax, only optimize if the mask is all ones.  */
+      if (n_args > 2
+         && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, 3)))
+       break;
+      if (n_args >= 5)
+       {
+         tree arg4 = gimple_call_arg (stmt, 4);
+         if (!tree_fits_uhwi_p (arg4))
+           break;
+         if (tree_to_uhwi (arg4) == 4)
+           /* Ok.  */;
+         else if (tree_to_uhwi (arg4) != 8)
+           /* Invalid round argument.  */
+           break;
+         else if (HONOR_NANS (arg0))
+           /* Lowering to comparison would raise exceptions which
+              shouldn't be raised.  */
+           break;
+       }
+      {
+       tree type = truth_type_for (TREE_TYPE (arg0));
+       tree cmpres = gimple_build (&stmts, tcode, type, arg0, arg1);
+       gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+       g = gimple_build_assign (gimple_call_lhs (stmt),
+                                VEC_COND_EXPR, cmpres, arg0, arg1);
+       gsi_replace (gsi, g, false);
+      }
+      return true;
 
     default:
       break;
--- gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c.jj       2024-09-25 10:19:40.925513841 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c  2024-09-25 10:19:40.925513841 +0200
@@ -0,0 +1,56 @@
+/* PR target/116738 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "__builtin_ia32_min" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "__builtin_ia32_max" "optimized" } } */
+
+#include <x86intrin.h>
+
+void
+test_pr116738 (void)
+{
+  __m512 a = _mm512_setr_ps (1.f, 2.f, 0.f, -0.f, -0.f, 0.f, 5.f, 6.f, 7.f,
+                            8.f, 9.f, 10.f, 11.f, -__builtin_inff (),
+                            __builtin_inff (), -42.f);
+  __m512 b = _mm512_setr_ps (-0.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 8.f,
+                            7.f, 10.f, -9.f, 12.f, 0.f, -0.f, 42.f);
+  __m512 w = _mm512_setr_ps (4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 0.f, 1.f,
+                            2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
+  __m512 c = _mm512_mask_min_ps (w, -1, a, b);
+  __m512 d = _mm512_mask_min_ps (w, 18658, a, b);
+  __m512 e = _mm512_mask_min_ps (w, 54649, a, b);
+  __m512 f = _mm512_mask_max_ps (w, -1, a, b);
+  __m512 g = _mm512_mask_max_ps (w, 18658, a, b);
+  __m512 h = _mm512_mask_max_ps (w, 54649, a, b);
+  __m128 i = _mm_setr_ps (1.f, 2.f, 0.f, -0.f);
+  __m128 j = _mm_setr_ps (-0.f, 3.f, -0.f, 0.f);
+  __m128 k = _mm_min_ss (i, j);
+  __m128 l = _mm_max_ss (j, i);
+  __m512 ce = _mm512_setr_ps (-0.f, 2.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 7.f,
+                             7.f, 9.f, -9.f, 11.f, -__builtin_inff (),
+                             -0.f, -42.f);
+  __m512 de = _mm512_setr_ps (4.f, 2.f, 6.f, 7.f, 8.f, 0.f, 5.f, 5.f, 1.f,
+                             2.f, 3.f, -9.f, 5.f, 6.f, -0.f, 8.f); 
+  __m512 ee = _mm512_setr_ps (-0.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 7.f,
+                             2.f, 9.f, 4.f, 11.f, 6.f, -0.f, -42.f);
+  __m512 fe = _mm512_setr_ps (1.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 6.f, 8.f,
+                             8.f, 10.f, 10.f, 12.f, 0.f, __builtin_inff (),
+                             42.f);
+  __m512 ge = _mm512_setr_ps (4.f, 3.f, 6.f, 7.f, 8.f, 0.f, 5.f, 6.f, 1.f,
+                             2.f, 3.f, 10.f, 5.f, 6.f, __builtin_inff (),
+                             8.f);
+  __m512 he = _mm512_setr_ps (1.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 8.f,
+                             2.f, 10.f, 4.f, 12.f, 6.f, __builtin_inff (),
+                             42.f);
+  __m128 ke = _mm_setr_ps (-0.f, 2.f, 0.f, -0.f);
+  __m128 le = _mm_setr_ps (1.f, 3.f, -0.f, 0.f);
+  if (__builtin_memcmp (&c, &ce, sizeof (c))
+      || __builtin_memcmp (&d, &de, sizeof (d))
+      || __builtin_memcmp (&e, &ee, sizeof (e))
+      || __builtin_memcmp (&f, &fe, sizeof (f))
+      || __builtin_memcmp (&g, &ge, sizeof (g))
+      || __builtin_memcmp (&h, &he, sizeof (h))
+      || __builtin_memcmp (&k, &ke, sizeof (k))
+      || __builtin_memcmp (&l, &le, sizeof (l)))
+    __builtin_abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c.jj       2024-09-25 10:19:40.925513841 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c  2024-09-25 10:19:40.925513841 +0200
@@ -0,0 +1,15 @@
+/* PR target/116738 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#define AVX512F
+#include "avx512f-helper.h"
+
+#include "avx512f-pr116738-1.c"
+
+void
+TEST (void)
+{
+  test_pr116738 ();
+}

        Jakub
