On February 23, 2019 1:27:46 AM GMT+01:00, Jakub Jelinek <ja...@redhat.com> wrote: >Hi! > >The following testcase is miscompiled on x86_64. The problem is that >simplify_merge_mask optimization throws away an inner VEC_MERGE when >there >is an outer one with the same mask. This can be done only if the >change >doesn't have observable side-effects. The code already uses >side_effects_p >tests in various spots, that is needed, but as this testcase shows, not >sufficient. Another issue is if there is a MEM load or store and not >MEM_NOTRAP_P, as the testcase shows. And another problem can be vector >integer division by zero (I think only mips has such insn), or various >floating point operations if we care about floating point exceptions. > >While fixing this, I've found that may_trap_p_1 doesn't really support >vector operations very much, vector floating point arithmetics can >cause >exceptions like scalar floating point arithmetics; on the other side, >the >4 VEC_* codes can't trap themselves, though their operands could. > >Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK. Richard. >2019-02-23 Jakub Jelinek <ja...@redhat.com> > > PR rtl-optimization/89445 > * simplify-rtx.c (simplify_ternary_operation): Don't use > simplify_merge_mask on operands that may trap. > * rtlanal.c (may_trap_p_1): Use FLOAT_MODE_P instead of > SCALAR_FLOAT_MODE_P checks. For integral division by zero, if > second operand is CONST_VECTOR, check if any element could be zero. > Don't expect traps for VEC_{MERGE,SELECT,CONCAT,DUPLICATE} unless > their operands can trap. > > * gcc.target/i386/avx512f-pr89445.c: New test. > >--- gcc/simplify-rtx.c.jj 2019-01-10 11:43:14.390377646 +0100 >+++ gcc/simplify-rtx.c 2019-02-22 19:01:08.977661098 +0100 >@@ -6073,8 +6073,10 @@ simplify_ternary_operation (enum rtx_cod > > if (!side_effects_p (op2)) > { >- rtx top0 = simplify_merge_mask (op0, op2, 0); >- rtx top1 = simplify_merge_mask (op1, op2, 1); >+ rtx top0 >+ = may_trap_p (op0) ? NULL_RTX : simplify_merge_mask (op0, op2, >0); >+ rtx top1 >+ = may_trap_p (op1) ? NULL_RTX : simplify_merge_mask (op1, op2, >1); > if (top0 || top1) > return simplify_gen_ternary (code, mode, mode, > top0 ? top0 : op0, >--- gcc/rtlanal.c.jj 2019-02-20 10:00:49.279492877 +0100 >+++ gcc/rtlanal.c 2019-02-22 19:03:02.478790634 +0100 >@@ -2846,10 +2846,28 @@ may_trap_p_1 (const_rtx x, unsigned flag > case UMOD: > if (HONOR_SNANS (x)) > return 1; >- if (SCALAR_FLOAT_MODE_P (GET_MODE (x))) >+ if (FLOAT_MODE_P (GET_MODE (x))) > return flag_trapping_math; > if (!CONSTANT_P (XEXP (x, 1)) || (XEXP (x, 1) == const0_rtx)) > return 1; >+ if (GET_CODE (XEXP (x, 1)) == CONST_VECTOR) >+ { >+ /* For CONST_VECTOR, return 1 if any element is or might be zero. >*/ >+ unsigned int n_elts; >+ rtx op = XEXP (x, 1); >+ if (!GET_MODE_NUNITS (GET_MODE (op)).is_constant (&n_elts)) >+ { >+ if (!CONST_VECTOR_DUPLICATE_P (op)) >+ return 1; >+ for (unsigned i = 0; i < (unsigned int) XVECLEN (op, 0); i++) >+ if (CONST_VECTOR_ENCODED_ELT (op, i) == const0_rtx) >+ return 1; >+ } >+ else >+ for (unsigned i = 0; i < n_elts; i++) >+ if (CONST_VECTOR_ELT (op, i) == const0_rtx) >+ return 1; >+ } > break; > > case EXPR_LIST: >@@ -2898,12 +2916,16 @@ may_trap_p_1 (const_rtx x, unsigned flag > case NEG: > case ABS: > case SUBREG: >+ case VEC_MERGE: >+ case VEC_SELECT: >+ case VEC_CONCAT: >+ case VEC_DUPLICATE: > /* These operations don't trap even with floating point. */ > break; > > default: > /* Any floating arithmetic may trap. */ >- if (SCALAR_FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math) >+ if (FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math) > return 1; > } > >--- gcc/testsuite/gcc.target/i386/avx512f-pr89445.c.jj 2019-02-22 >19:19:17.709898754 +0100 >+++ gcc/testsuite/gcc.target/i386/avx512f-pr89445.c 2019-02-22 >19:18:58.115216531 +0100 >@@ -0,0 +1,54 @@ >+/* PR rtl-optimization/89445 */ >+/* { dg-do run { target { avx512f && mmap } } } */ >+/* { dg-options "-O2 -mavx512f" } */ >+ >+#include "avx512f-check.h" >+ >+#include <sys/mman.h> >+#ifndef MAP_ANONYMOUS >+#define MAP_ANONYMOUS MAP_ANON >+#endif >+#ifndef MAP_ANON >+#define MAP_ANON 0 >+#endif >+#ifndef MAP_FAILED >+#define MAP_FAILED ((void *)-1) >+#endif >+ >+__attribute__ ((noipa)) >+void daxpy (unsigned long n, double a, double const *__restrict x, >+ double *__restrict y) >+{ >+ const __m512d v_a = _mm512_broadcastsd_pd (_mm_set_sd (a)); >+ const __mmask16 final = (1U << (n % 8u)) - 1; >+ __mmask16 mask = 65535u; >+ unsigned long i; >+ for (i = 0; i < n * sizeof (double); i += 8 * sizeof (double)) >+ { >+ if (i + 8 * sizeof (double) > n * sizeof (double)) >+ mask = final; >+ __m512d v_x = _mm512_maskz_loadu_pd (mask, (char const *) x + >i); >+ __m512d v_y = _mm512_maskz_loadu_pd (mask, (char const *) y + >i); >+ __m512d tmp = _mm512_fmadd_pd (v_x, v_a, v_y); >+ _mm512_mask_storeu_pd ((char *) y + i, mask, tmp); >+ } >+} >+ >+static const double x[] = { 1, 2, 3, 4 }; >+ >+static void >+avx512f_test (void) >+{ >+ char *ptr >+ = (char *) mmap (NULL, 2 * 4096, PROT_READ | PROT_WRITE, >+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); >+ if (ptr == MAP_FAILED) >+ return; >+ >+ munmap (ptr + 4096, 4096); >+ double *y = (double *) (ptr + 4096 - sizeof (x)); >+ __builtin_memcpy (y, x, sizeof (x)); >+ daxpy (sizeof (x) / sizeof (x[0]), 1.0, x, y); >+ if (y[0] != 2.0 || y[1] != 4.0 || y[2] != 6.0 || y[3] != 8.0) >+ abort (); >+} > > Jakub