On Wed, Sep 25, 2024 at 10:49 AM Hongtao Liu <crazy...@gmail.com> wrote: > > On Wed, Sep 25, 2024 at 4:42 PM Jakub Jelinek <ja...@redhat.com> wrote: > > > > On Wed, Sep 25, 2024 at 10:17:50AM +0800, Hongtao Liu wrote: > > > > + for (int i = 0; i < 2; ++i) > > > > + { > > > > + unsigned count = vector_cst_encoded_nelts (args[i]), j; > > > > + for (j = 0; j < count; ++j) > > > > + if (!tree_expr_nan_p (VECTOR_CST_ENCODED_ELT > > > > (args[i], j))) > > > Is this a typo? I assume you want to check if the component is NAN, so > > > tree_expr_nan_p, not !tree_expr_nan_p? > > > > + break; > > > > + if (j < count) > > > > + break; > > > Also this break just break the outer loop(for (int i = 0; i < 2; > > > i++)), but according to comments, it wants to break the outer switch? > > > > You're right, thanks for catching that. Fortunately both meant just that > > it got NaNs optimized too and optimized the rest as it should. > > > > I just wanted to avoid return NULL_TREE; or goto and screwed it up. > > > > Here is a fixed version, tested additionally on looking at gimple dump on > > typedef float __v4sf __attribute__((vector_size (16))); > > __v4sf foo (void) { return __builtin_ia32_minss ((__v4sf) { __builtin_nanf > > (""), 0.f, 0.f, 0.f }, (__v4sf) { __builtin_inff (), 1.0f, 2.0f, 3.0f }); } > > __v4sf bar (void) { return __builtin_ia32_minss ((__v4sf) { -__builtin_inff > > (), 0.f, 0.f, 0.f }, (__v4sf) { __builtin_inff (), 1.0f, 2.0f, 3.0f }); } > > > > Ok for trunk if it passes bootstrap/regtest? > Ok.
I'll note it would be much simpler if we could write x > y ? x : y in the intrinsic header. Yeah - need to ping that C FE patch to allow vector ?: again ... Richard. > > > > 2024-09-25 Jakub Jelinek <ja...@redhat.com> > > > > PR target/116738 > > * config/i386/i386.cc (ix86_fold_builtin): Handle > > IX86_BUILTIN_M{IN,AX}{S,P}{S,H,D}*. > > (ix86_gimple_fold_builtin): Handle IX86_BUILTIN_M{IN,AX}P{S,H,D}*. > > > > * gcc.target/i386/avx512f-pr116738-1.c: New test. > > * gcc.target/i386/avx512f-pr116738-2.c: New test. > > > > --- gcc/config/i386/i386.cc.jj 2024-09-24 18:54:24.120313544 +0200 > > +++ gcc/config/i386/i386.cc 2024-09-25 10:21:00.922417024 +0200 > > @@ -18507,6 +18507,8 @@ ix86_fold_builtin (tree fndecl, int n_ar > > = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); > > enum rtx_code rcode; > > bool is_vshift; > > + enum tree_code tcode; > > + bool is_scalar; > > unsigned HOST_WIDE_INT mask; > > > > switch (fn_code) > > @@ -18956,6 +18958,131 @@ ix86_fold_builtin (tree fndecl, int n_ar > > } > > break; > > > > + case IX86_BUILTIN_MINSS: > > + case IX86_BUILTIN_MINSH_MASK: > > + tcode = LT_EXPR; > > + is_scalar = true; > > + goto do_minmax; > > + > > + case IX86_BUILTIN_MAXSS: > > + case IX86_BUILTIN_MAXSH_MASK: > > + tcode = GT_EXPR; > > + is_scalar = true; > > + goto do_minmax; > > + > > + case IX86_BUILTIN_MINPS: > > + case IX86_BUILTIN_MINPD: > > + case IX86_BUILTIN_MINPS256: > > + case IX86_BUILTIN_MINPD256: > > + case IX86_BUILTIN_MINPS512: > > + case IX86_BUILTIN_MINPD512: > > + case IX86_BUILTIN_MINPS128_MASK: > > + case IX86_BUILTIN_MINPD128_MASK: > > + case IX86_BUILTIN_MINPS256_MASK: > > + case IX86_BUILTIN_MINPD256_MASK: > > + case IX86_BUILTIN_MINPH128_MASK: > > + case IX86_BUILTIN_MINPH256_MASK: > > + case IX86_BUILTIN_MINPH512_MASK: > > + tcode = LT_EXPR; > > + is_scalar = false; > > + goto do_minmax; > > + > > + case IX86_BUILTIN_MAXPS: > > + case IX86_BUILTIN_MAXPD: > > + case IX86_BUILTIN_MAXPS256: > > + case IX86_BUILTIN_MAXPD256: > > + case IX86_BUILTIN_MAXPS512: > > + case IX86_BUILTIN_MAXPD512: > > + case IX86_BUILTIN_MAXPS128_MASK: > > + case IX86_BUILTIN_MAXPD128_MASK: > > + case IX86_BUILTIN_MAXPS256_MASK: > > + case IX86_BUILTIN_MAXPD256_MASK: > > + case IX86_BUILTIN_MAXPH128_MASK: > > + case IX86_BUILTIN_MAXPH256_MASK: > > + case IX86_BUILTIN_MAXPH512_MASK: > > + tcode = GT_EXPR; > > + is_scalar = false; > > + do_minmax: > > + gcc_assert (n_args >= 2); > > + if (TREE_CODE (args[0]) != VECTOR_CST > > + || TREE_CODE (args[1]) != VECTOR_CST) > > + break; > > + mask = HOST_WIDE_INT_M1U; > > + if (n_args > 2) > > + { > > + gcc_assert (n_args >= 4); > > + /* This is masked minmax. */ > > + if (TREE_CODE (args[3]) != INTEGER_CST > > + || TREE_SIDE_EFFECTS (args[2])) > > + break; > > + mask = TREE_INT_CST_LOW (args[3]); > > + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); > > + mask |= HOST_WIDE_INT_M1U << elems; > > + if (mask != HOST_WIDE_INT_M1U > > + && TREE_CODE (args[2]) != VECTOR_CST) > > + break; > > + if (n_args >= 5) > > + { > > + if (!tree_fits_uhwi_p (args[4])) > > + break; > > + if (tree_to_uhwi (args[4]) != 4 > > + && tree_to_uhwi (args[4]) != 8) > > + break; > > + } > > + if (mask == (HOST_WIDE_INT_M1U << elems)) > > + return args[2]; > > + } > > + /* Punt on NaNs, unless exceptions are disabled. */ > > + if (HONOR_NANS (args[0]) > > + && (n_args < 5 || tree_to_uhwi (args[4]) != 8)) > > + for (int i = 0; i < 2; ++i) > > + { > > + unsigned count = vector_cst_encoded_nelts (args[i]); > > + for (unsigned j = 0; j < count; ++j) > > + if (tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j))) > > + return NULL_TREE; > > + } > > + { > > + tree res = const_binop (tcode, > > + truth_type_for (TREE_TYPE (args[0])), > > + args[0], args[1]); > > + if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST) > > + break; > > + res = fold_ternary (VEC_COND_EXPR, TREE_TYPE (args[0]), res, > > + args[0], args[1]); > > + if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST) > > + break; > > + if (mask != HOST_WIDE_INT_M1U) > > + { > > + unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); > > + vec_perm_builder sel (nelts, nelts, 1); > > + for (unsigned int i = 0; i < nelts; i++) > > + if (mask & (HOST_WIDE_INT_1U << i)) > > + sel.quick_push (i); > > + else > > + sel.quick_push (nelts + i); > > + vec_perm_indices indices (sel, 2, nelts); > > + res = fold_vec_perm (TREE_TYPE (args[0]), res, args[2], > > + indices); > > + if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST) > > + break; > > + } > > + if (is_scalar) > > + { > > + unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); > > + vec_perm_builder sel (nelts, nelts, 1); > > + sel.quick_push (0); > > + for (unsigned int i = 1; i < nelts; i++) > > + sel.quick_push (nelts + i); > > + vec_perm_indices indices (sel, 2, nelts); > > + res = fold_vec_perm (TREE_TYPE (args[0]), res, args[0], > > + indices); > > + if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST) > > + break; > > + } > > + return res; > > + } > > + > > default: > > break; > > } > > @@ -19500,6 +19627,74 @@ ix86_gimple_fold_builtin (gimple_stmt_it > > gsi_replace (gsi, g, false); > > } > > return true; > > + > > + case IX86_BUILTIN_MINPS: > > + case IX86_BUILTIN_MINPD: > > + case IX86_BUILTIN_MINPS256: > > + case IX86_BUILTIN_MINPD256: > > + case IX86_BUILTIN_MINPS512: > > + case IX86_BUILTIN_MINPD512: > > + case IX86_BUILTIN_MINPS128_MASK: > > + case IX86_BUILTIN_MINPD128_MASK: > > + case IX86_BUILTIN_MINPS256_MASK: > > + case IX86_BUILTIN_MINPD256_MASK: > > + case IX86_BUILTIN_MINPH128_MASK: > > + case IX86_BUILTIN_MINPH256_MASK: > > + case IX86_BUILTIN_MINPH512_MASK: > > + tcode = LT_EXPR; > > + goto do_minmax; > > + > > + case IX86_BUILTIN_MAXPS: > > + case IX86_BUILTIN_MAXPD: > > + case IX86_BUILTIN_MAXPS256: > > + case IX86_BUILTIN_MAXPD256: > > + case IX86_BUILTIN_MAXPS512: > > + case IX86_BUILTIN_MAXPD512: > > + case IX86_BUILTIN_MAXPS128_MASK: > > + case IX86_BUILTIN_MAXPD128_MASK: > > + case IX86_BUILTIN_MAXPS256_MASK: > > + case IX86_BUILTIN_MAXPD256_MASK: > > + case IX86_BUILTIN_MAXPH128_MASK: > > + case IX86_BUILTIN_MAXPH256_MASK: > > + case IX86_BUILTIN_MAXPH512_MASK: > > + tcode = GT_EXPR; > > + do_minmax: > > + gcc_assert (n_args >= 2); > > + /* Without SSE4.1 we often aren't able to pattern match it back to > > the > > + desired instruction. */ > > + if (!gimple_call_lhs (stmt) || !optimize || !TARGET_SSE4_1) > > + break; > > + arg0 = gimple_call_arg (stmt, 0); > > + arg1 = gimple_call_arg (stmt, 1); > > + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + /* For masked minmax, only optimize if the mask is all ones. */ > > + if (n_args > 2 > > + && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, 3))) > > + break; > > + if (n_args >= 5) > > + { > > + tree arg4 = gimple_call_arg (stmt, 4); > > + if (!tree_fits_uhwi_p (arg4)) > > + break; > > + if (tree_to_uhwi (arg4) == 4) > > + /* Ok. */; > > + else if (tree_to_uhwi (arg4) != 8) > > + /* Invalid round argument. */ > > + break; > > + else if (HONOR_NANS (arg0)) > > + /* Lowering to comparison would raise exceptions which > > + shouldn't be raised. */ > > + break; > > + } > > + { > > + tree type = truth_type_for (TREE_TYPE (arg0)); > > + tree cmpres = gimple_build (&stmts, tcode, type, arg0, arg1); > > + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); > > + g = gimple_build_assign (gimple_call_lhs (stmt), > > + VEC_COND_EXPR, cmpres, arg0, arg1); > > + gsi_replace (gsi, g, false); > > + } > > + return true; > > > > default: > > break; > > --- gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c.jj 2024-09-25 > > 10:19:40.925513841 +0200 > > +++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c 2024-09-25 > > 10:19:40.925513841 +0200 > > @@ -0,0 +1,56 @@ > > +/* PR target/116738 */ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx512f -fdump-tree-optimized" } */ > > +/* { dg-final { scan-tree-dump-not "__builtin_ia32_min" "optimized" } } */ > > +/* { dg-final { scan-tree-dump-not "__builtin_ia32_max" "optimized" } } */ > > + > > +#include <x86intrin.h> > > + > > +void > > +test_pr116738 (void) > > +{ > > + __m512 a = _mm512_setr_ps (1.f, 2.f, 0.f, -0.f, -0.f, 0.f, 5.f, 6.f, 7.f, > > + 8.f, 9.f, 10.f, 11.f, -__builtin_inff (), > > + __builtin_inff (), -42.f); > > + __m512 b = _mm512_setr_ps (-0.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, > > 8.f, > > + 7.f, 10.f, -9.f, 12.f, 0.f, -0.f, 42.f); > > + __m512 w = _mm512_setr_ps (4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 0.f, 1.f, > > + 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f); > > + __m512 c = _mm512_mask_min_ps (w, -1, a, b); > > + __m512 d = _mm512_mask_min_ps (w, 18658, a, b); > > + __m512 e = _mm512_mask_min_ps (w, 54649, a, b); > > + __m512 f = _mm512_mask_max_ps (w, -1, a, b); > > + __m512 g = _mm512_mask_max_ps (w, 18658, a, b); > > + __m512 h = _mm512_mask_max_ps (w, 54649, a, b); > > + __m128 i = _mm_setr_ps (1.f, 2.f, 0.f, -0.f); > > + __m128 j = _mm_setr_ps (-0.f, 3.f, -0.f, 0.f); > > + __m128 k = _mm_min_ss (i, j); > > + __m128 l = _mm_max_ss (j, i); > > + __m512 ce = _mm512_setr_ps (-0.f, 2.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, > > 7.f, > > + 7.f, 9.f, -9.f, 11.f, -__builtin_inff (), > > + -0.f, -42.f); > > + __m512 de = _mm512_setr_ps (4.f, 2.f, 6.f, 7.f, 8.f, 0.f, 5.f, 5.f, 1.f, > > + 2.f, 3.f, -9.f, 5.f, 6.f, -0.f, 8.f); > > + __m512 ee = _mm512_setr_ps (-0.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, > > 7.f, > > + 2.f, 9.f, 4.f, 11.f, 6.f, -0.f, -42.f); > > + __m512 fe = _mm512_setr_ps (1.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 6.f, > > 8.f, > > + 8.f, 10.f, 10.f, 12.f, 0.f, __builtin_inff (), > > + 42.f); > > + __m512 ge = _mm512_setr_ps (4.f, 3.f, 6.f, 7.f, 8.f, 0.f, 5.f, 6.f, 1.f, > > + 2.f, 3.f, 10.f, 5.f, 6.f, __builtin_inff (), > > + 8.f); > > + __m512 he = _mm512_setr_ps (1.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 8.f, > > + 2.f, 10.f, 4.f, 12.f, 6.f, __builtin_inff (), > > + 42.f); > > + __m128 ke = _mm_setr_ps (-0.f, 2.f, 0.f, -0.f); > > + __m128 le = _mm_setr_ps (1.f, 3.f, -0.f, 0.f); > > + if (__builtin_memcmp (&c, &ce, sizeof (c)) > > + || __builtin_memcmp (&d, &de, sizeof (d)) > > + || __builtin_memcmp (&e, &ee, sizeof (e)) > > + || __builtin_memcmp (&f, &fe, sizeof (f)) > > + || __builtin_memcmp (&g, &ge, sizeof (g)) > > + || __builtin_memcmp (&h, &he, sizeof (h)) > > + || __builtin_memcmp (&k, &ke, sizeof (k)) > > + || __builtin_memcmp (&l, &le, sizeof (l))) > > + __builtin_abort (); > > +} > > --- gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c.jj 2024-09-25 > > 10:19:40.925513841 +0200 > > +++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c 2024-09-25 > > 10:19:40.925513841 +0200 > > @@ -0,0 +1,15 @@ > > +/* PR target/116738 */ > > +/* { dg-do run } */ > > +/* { dg-options "-O2 -mavx512f" } */ > > +/* { dg-require-effective-target avx512f } */ > > + > > +#define AVX512F > > +#include "avx512f-helper.h" > > + > > +#include "avx512f-pr116738-1.c" > > + > > +void > > +TEST (void) > > +{ > > + test_pr116738 (); > > +} > > > > Jakub > > > > > -- > BR, > Hongtao