On Wed, Sep 25, 2024 at 10:49 AM Hongtao Liu <crazy...@gmail.com> wrote:
>
> On Wed, Sep 25, 2024 at 4:42 PM Jakub Jelinek <ja...@redhat.com> wrote:
> >
> > On Wed, Sep 25, 2024 at 10:17:50AM +0800, Hongtao Liu wrote:
> > > > +           for (int i = 0; i < 2; ++i)
> > > > +             {
> > > > +               unsigned count = vector_cst_encoded_nelts (args[i]), j;
> > > > +               for (j = 0; j < count; ++j)
> > > > +                 if (!tree_expr_nan_p (VECTOR_CST_ENCODED_ELT 
> > > > (args[i], j)))
> > > Is this a typo? I assume you want to check if the component is NAN, so
> > > tree_expr_nan_p, not !tree_expr_nan_p?
> > > > +                   break;
> > > > +               if (j < count)
> > > > +                 break;
> > > Also this break just breaks the outer loop (for (int i = 0; i < 2;
> > > i++)), but according to the comments, it wants to break out of the outer switch?
> >
> > You're right, thanks for catching that.  Fortunately both meant just that
> > it got NaNs optimized too and optimized the rest as it should.
> >
> > I just wanted to avoid return NULL_TREE; or goto and screwed it up.
> >
> > Here is a fixed version, tested additionally on looking at gimple dump on
> > typedef float __v4sf __attribute__((vector_size (16)));
> > __v4sf foo (void) { return __builtin_ia32_minss ((__v4sf) { __builtin_nanf 
> > (""), 0.f, 0.f, 0.f }, (__v4sf) { __builtin_inff (), 1.0f, 2.0f, 3.0f }); }
> > __v4sf bar (void) { return __builtin_ia32_minss ((__v4sf) { -__builtin_inff 
> > (), 0.f, 0.f, 0.f }, (__v4sf) { __builtin_inff (), 1.0f, 2.0f, 3.0f }); }
> >
> > Ok for trunk if it passes bootstrap/regtest?
> Ok.

I'll note it would be much simpler if we could write x > y ? x : y in
the intrinsic header.

Yeah - need to ping that C FE patch to allow vector ?: again ...

Richard.

> >
> > 2024-09-25  Jakub Jelinek  <ja...@redhat.com>
> >
> >         PR target/116738
> >         * config/i386/i386.cc (ix86_fold_builtin): Handle
> >         IX86_BUILTIN_M{IN,AX}{S,P}{S,H,D}*.
> >         (ix86_gimple_fold_builtin): Handle IX86_BUILTIN_M{IN,AX}P{S,H,D}*.
> >
> >         * gcc.target/i386/avx512f-pr116738-1.c: New test.
> >         * gcc.target/i386/avx512f-pr116738-2.c: New test.
> >
> > --- gcc/config/i386/i386.cc.jj  2024-09-24 18:54:24.120313544 +0200
> > +++ gcc/config/i386/i386.cc     2024-09-25 10:21:00.922417024 +0200
> > @@ -18507,6 +18507,8 @@ ix86_fold_builtin (tree fndecl, int n_ar
> >         = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
> >        enum rtx_code rcode;
> >        bool is_vshift;
> > +      enum tree_code tcode;
> > +      bool is_scalar;
> >        unsigned HOST_WIDE_INT mask;
> >
> >        switch (fn_code)
> > @@ -18956,6 +18958,131 @@ ix86_fold_builtin (tree fndecl, int n_ar
> >             }
> >           break;
> >
> > +       case IX86_BUILTIN_MINSS:
> > +       case IX86_BUILTIN_MINSH_MASK:
> > +         tcode = LT_EXPR;
> > +         is_scalar = true;
> > +         goto do_minmax;
> > +
> > +       case IX86_BUILTIN_MAXSS:
> > +       case IX86_BUILTIN_MAXSH_MASK:
> > +         tcode = GT_EXPR;
> > +         is_scalar = true;
> > +         goto do_minmax;
> > +
> > +       case IX86_BUILTIN_MINPS:
> > +       case IX86_BUILTIN_MINPD:
> > +       case IX86_BUILTIN_MINPS256:
> > +       case IX86_BUILTIN_MINPD256:
> > +       case IX86_BUILTIN_MINPS512:
> > +       case IX86_BUILTIN_MINPD512:
> > +       case IX86_BUILTIN_MINPS128_MASK:
> > +       case IX86_BUILTIN_MINPD128_MASK:
> > +       case IX86_BUILTIN_MINPS256_MASK:
> > +       case IX86_BUILTIN_MINPD256_MASK:
> > +       case IX86_BUILTIN_MINPH128_MASK:
> > +       case IX86_BUILTIN_MINPH256_MASK:
> > +       case IX86_BUILTIN_MINPH512_MASK:
> > +         tcode = LT_EXPR;
> > +         is_scalar = false;
> > +         goto do_minmax;
> > +
> > +       case IX86_BUILTIN_MAXPS:
> > +       case IX86_BUILTIN_MAXPD:
> > +       case IX86_BUILTIN_MAXPS256:
> > +       case IX86_BUILTIN_MAXPD256:
> > +       case IX86_BUILTIN_MAXPS512:
> > +       case IX86_BUILTIN_MAXPD512:
> > +       case IX86_BUILTIN_MAXPS128_MASK:
> > +       case IX86_BUILTIN_MAXPD128_MASK:
> > +       case IX86_BUILTIN_MAXPS256_MASK:
> > +       case IX86_BUILTIN_MAXPD256_MASK:
> > +       case IX86_BUILTIN_MAXPH128_MASK:
> > +       case IX86_BUILTIN_MAXPH256_MASK:
> > +       case IX86_BUILTIN_MAXPH512_MASK:
> > +         tcode = GT_EXPR;
> > +         is_scalar = false;
> > +       do_minmax:
> > +         gcc_assert (n_args >= 2);
> > +         if (TREE_CODE (args[0]) != VECTOR_CST
> > +             || TREE_CODE (args[1]) != VECTOR_CST)
> > +           break;
> > +         mask = HOST_WIDE_INT_M1U;
> > +         if (n_args > 2)
> > +           {
> > +             gcc_assert (n_args >= 4);
> > +             /* This is masked minmax.  */
> > +             if (TREE_CODE (args[3]) != INTEGER_CST
> > +                 || TREE_SIDE_EFFECTS (args[2]))
> > +               break;
> > +             mask = TREE_INT_CST_LOW (args[3]);
> > +             unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
> > +             mask |= HOST_WIDE_INT_M1U << elems;
> > +             if (mask != HOST_WIDE_INT_M1U
> > +                 && TREE_CODE (args[2]) != VECTOR_CST)
> > +               break;
> > +             if (n_args >= 5)
> > +               {
> > +                 if (!tree_fits_uhwi_p (args[4]))
> > +                   break;
> > +                 if (tree_to_uhwi (args[4]) != 4
> > +                     && tree_to_uhwi (args[4]) != 8)
> > +                   break;
> > +               }
> > +             if (mask == (HOST_WIDE_INT_M1U << elems))
> > +               return args[2];
> > +           }
> > +         /* Punt on NaNs, unless exceptions are disabled.  */
> > +         if (HONOR_NANS (args[0])
> > +             && (n_args < 5 || tree_to_uhwi (args[4]) != 8))
> > +           for (int i = 0; i < 2; ++i)
> > +             {
> > +               unsigned count = vector_cst_encoded_nelts (args[i]);
> > +               for (unsigned j = 0; j < count; ++j)
> > +                 if (tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j)))
> > +                   return NULL_TREE;
> > +             }
> > +         {
> > +           tree res = const_binop (tcode,
> > +                                   truth_type_for (TREE_TYPE (args[0])),
> > +                                   args[0], args[1]);
> > +           if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
> > +             break;
> > +           res = fold_ternary (VEC_COND_EXPR, TREE_TYPE (args[0]), res,
> > +                               args[0], args[1]);
> > +           if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
> > +             break;
> > +           if (mask != HOST_WIDE_INT_M1U)
> > +             {
> > +               unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
> > +               vec_perm_builder sel (nelts, nelts, 1);
> > +               for (unsigned int i = 0; i < nelts; i++)
> > +                 if (mask & (HOST_WIDE_INT_1U << i))
> > +                   sel.quick_push (i);
> > +                 else
> > +                   sel.quick_push (nelts + i);
> > +               vec_perm_indices indices (sel, 2, nelts);
> > +               res = fold_vec_perm (TREE_TYPE (args[0]), res, args[2],
> > +                                    indices);
> > +               if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
> > +                 break;
> > +             }
> > +           if (is_scalar)
> > +             {
> > +               unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
> > +               vec_perm_builder sel (nelts, nelts, 1);
> > +               sel.quick_push (0);
> > +               for (unsigned int i = 1; i < nelts; i++)
> > +                 sel.quick_push (nelts + i);
> > +               vec_perm_indices indices (sel, 2, nelts);
> > +               res = fold_vec_perm (TREE_TYPE (args[0]), res, args[0],
> > +                                    indices);
> > +               if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
> > +                 break;
> > +             }
> > +           return res;
> > +         }
> > +
> >         default:
> >           break;
> >         }
> > @@ -19500,6 +19627,74 @@ ix86_gimple_fold_builtin (gimple_stmt_it
> >         gsi_replace (gsi, g, false);
> >        }
> >        return true;
> > +
> > +    case IX86_BUILTIN_MINPS:
> > +    case IX86_BUILTIN_MINPD:
> > +    case IX86_BUILTIN_MINPS256:
> > +    case IX86_BUILTIN_MINPD256:
> > +    case IX86_BUILTIN_MINPS512:
> > +    case IX86_BUILTIN_MINPD512:
> > +    case IX86_BUILTIN_MINPS128_MASK:
> > +    case IX86_BUILTIN_MINPD128_MASK:
> > +    case IX86_BUILTIN_MINPS256_MASK:
> > +    case IX86_BUILTIN_MINPD256_MASK:
> > +    case IX86_BUILTIN_MINPH128_MASK:
> > +    case IX86_BUILTIN_MINPH256_MASK:
> > +    case IX86_BUILTIN_MINPH512_MASK:
> > +      tcode = LT_EXPR;
> > +      goto do_minmax;
> > +
> > +    case IX86_BUILTIN_MAXPS:
> > +    case IX86_BUILTIN_MAXPD:
> > +    case IX86_BUILTIN_MAXPS256:
> > +    case IX86_BUILTIN_MAXPD256:
> > +    case IX86_BUILTIN_MAXPS512:
> > +    case IX86_BUILTIN_MAXPD512:
> > +    case IX86_BUILTIN_MAXPS128_MASK:
> > +    case IX86_BUILTIN_MAXPD128_MASK:
> > +    case IX86_BUILTIN_MAXPS256_MASK:
> > +    case IX86_BUILTIN_MAXPD256_MASK:
> > +    case IX86_BUILTIN_MAXPH128_MASK:
> > +    case IX86_BUILTIN_MAXPH256_MASK:
> > +    case IX86_BUILTIN_MAXPH512_MASK:
> > +      tcode = GT_EXPR;
> > +    do_minmax:
> > +      gcc_assert (n_args >= 2);
> > +      /* Without SSE4.1 we often aren't able to pattern match it back to 
> > the
> > +        desired instruction.  */
> > +      if (!gimple_call_lhs (stmt) || !optimize || !TARGET_SSE4_1)
> > +       break;
> > +      arg0 = gimple_call_arg (stmt, 0);
> > +      arg1 = gimple_call_arg (stmt, 1);
> > +      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +      /* For masked minmax, only optimize if the mask is all ones.  */
> > +      if (n_args > 2
> > +         && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, 3)))
> > +       break;
> > +      if (n_args >= 5)
> > +       {
> > +         tree arg4 = gimple_call_arg (stmt, 4);
> > +         if (!tree_fits_uhwi_p (arg4))
> > +           break;
> > +         if (tree_to_uhwi (arg4) == 4)
> > +           /* Ok.  */;
> > +         else if (tree_to_uhwi (arg4) != 8)
> > +           /* Invalid round argument.  */
> > +           break;
> > +         else if (HONOR_NANS (arg0))
> > +           /* Lowering to comparison would raise exceptions which
> > +              shouldn't be raised.  */
> > +           break;
> > +       }
> > +      {
> > +       tree type = truth_type_for (TREE_TYPE (arg0));
> > +       tree cmpres = gimple_build (&stmts, tcode, type, arg0, arg1);
> > +       gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > +       g = gimple_build_assign (gimple_call_lhs (stmt),
> > +                                VEC_COND_EXPR, cmpres, arg0, arg1);
> > +       gsi_replace (gsi, g, false);
> > +      }
> > +      return true;
> >
> >      default:
> >        break;
> > --- gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c.jj       2024-09-25 
> > 10:19:40.925513841 +0200
> > +++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c  2024-09-25 
> > 10:19:40.925513841 +0200
> > @@ -0,0 +1,56 @@
> > +/* PR target/116738 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx512f -fdump-tree-optimized" } */
> > +/* { dg-final { scan-tree-dump-not "__builtin_ia32_min" "optimized" } } */
> > +/* { dg-final { scan-tree-dump-not "__builtin_ia32_max" "optimized" } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +void
> > +test_pr116738 (void)
> > +{
> > +  __m512 a = _mm512_setr_ps (1.f, 2.f, 0.f, -0.f, -0.f, 0.f, 5.f, 6.f, 7.f,
> > +                            8.f, 9.f, 10.f, 11.f, -__builtin_inff (),
> > +                            __builtin_inff (), -42.f);
> > +  __m512 b = _mm512_setr_ps (-0.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 
> > 8.f,
> > +                            7.f, 10.f, -9.f, 12.f, 0.f, -0.f, 42.f);
> > +  __m512 w = _mm512_setr_ps (4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 0.f, 1.f,
> > +                            2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
> > +  __m512 c = _mm512_mask_min_ps (w, -1, a, b);
> > +  __m512 d = _mm512_mask_min_ps (w, 18658, a, b);
> > +  __m512 e = _mm512_mask_min_ps (w, 54649, a, b);
> > +  __m512 f = _mm512_mask_max_ps (w, -1, a, b);
> > +  __m512 g = _mm512_mask_max_ps (w, 18658, a, b);
> > +  __m512 h = _mm512_mask_max_ps (w, 54649, a, b);
> > +  __m128 i = _mm_setr_ps (1.f, 2.f, 0.f, -0.f);
> > +  __m128 j = _mm_setr_ps (-0.f, 3.f, -0.f, 0.f);
> > +  __m128 k = _mm_min_ss (i, j);
> > +  __m128 l = _mm_max_ss (j, i);
> > +  __m512 ce = _mm512_setr_ps (-0.f, 2.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 
> > 7.f,
> > +                             7.f, 9.f, -9.f, 11.f, -__builtin_inff (),
> > +                             -0.f, -42.f);
> > +  __m512 de = _mm512_setr_ps (4.f, 2.f, 6.f, 7.f, 8.f, 0.f, 5.f, 5.f, 1.f,
> > +                             2.f, 3.f, -9.f, 5.f, 6.f, -0.f, 8.f);
> > +  __m512 ee = _mm512_setr_ps (-0.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 
> > 7.f,
> > +                             2.f, 9.f, 4.f, 11.f, 6.f, -0.f, -42.f);
> > +  __m512 fe = _mm512_setr_ps (1.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 6.f, 
> > 8.f,
> > +                             8.f, 10.f, 10.f, 12.f, 0.f, __builtin_inff (),
> > +                             42.f);
> > +  __m512 ge = _mm512_setr_ps (4.f, 3.f, 6.f, 7.f, 8.f, 0.f, 5.f, 6.f, 1.f,
> > +                             2.f, 3.f, 10.f, 5.f, 6.f, __builtin_inff (),
> > +                             8.f);
> > +  __m512 he = _mm512_setr_ps (1.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 8.f,
> > +                             2.f, 10.f, 4.f, 12.f, 6.f, __builtin_inff (),
> > +                             42.f);
> > +  __m128 ke = _mm_setr_ps (-0.f, 2.f, 0.f, -0.f);
> > +  __m128 le = _mm_setr_ps (1.f, 3.f, -0.f, 0.f);
> > +  if (__builtin_memcmp (&c, &ce, sizeof (c))
> > +      || __builtin_memcmp (&d, &de, sizeof (d))
> > +      || __builtin_memcmp (&e, &ee, sizeof (e))
> > +      || __builtin_memcmp (&f, &fe, sizeof (f))
> > +      || __builtin_memcmp (&g, &ge, sizeof (g))
> > +      || __builtin_memcmp (&h, &he, sizeof (h))
> > +      || __builtin_memcmp (&k, &ke, sizeof (k))
> > +      || __builtin_memcmp (&l, &le, sizeof (l)))
> > +    __builtin_abort ();
> > +}
> > --- gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c.jj       2024-09-25 
> > 10:19:40.925513841 +0200
> > +++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c  2024-09-25 
> > 10:19:40.925513841 +0200
> > @@ -0,0 +1,15 @@
> > +/* PR target/116738 */
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512f" } */
> > +/* { dg-require-effective-target avx512f } */
> > +
> > +#define AVX512F
> > +#include "avx512f-helper.h"
> > +
> > +#include "avx512f-pr116738-1.c"
> > +
> > +void
> > +TEST (void)
> > +{
> > +  test_pr116738 ();
> > +}
> >
> >         Jakub
> >
>
>
> --
> BR,
> Hongtao

Reply via email to