On Mon, May 27, 2024 at 2:48 PM Hongtao Liu <crazy...@gmail.com> wrote:
>
> On Sat, May 18, 2024 at 4:10 AM Roger Sayle <ro...@nextmovesoftware.com> 
> wrote:
> >
> >
> > Hi Hongtao,
> > Many thanks for the review, bug fixes and suggestions for improvements.
> > This revised version of the patch, implements all of your corrections.  In 
> > theory
> > the "ternlog idx" should guarantee that some operands are non-null, but I 
> > agree
> > that it's better defensive programming to check invariants not easily 
> > proved.
> > Instead of calling ix86_expand_vector_move, I use 
> > ix86_broadcast_from_constant
> > to achieve the same effect of using a broadcast when possible, but has the 
> > benefit
> > of still using a memory operand (instead of a vector load) when 
> > broadcasting isn't
> > possible.  There are other places that could benefit from the same trick, 
> > but I can
> > address these in a follow-up patch (it may even be preferrable to keep 
> > these as
> > CONST_VECTOR during early RTL passes and lower to broadcast or constant pool
> > using splitters).
> >
> > This revised patch has been tested on x86_64-pc-linux-gnu with make 
> > bootstrap
> > and make -k check, both with and without --target_board=unix{-m32}
> > with no new failures.  Ok for mainline?
> 1 file changed, 41 insertions(+)
> gcc/config/i386/i386-expand.cc | 41 +++++++++++++++++++++++++++++++++++++++++
>
> modified   gcc/config/i386/i386-expand.cc
> @@ -25579,14 +25579,22 @@ ix86_gen_bcst_mem (machine_mode mode, rtx x)
>        && !CONST_DOUBLE_P (cst)
>        && !CONST_FIXED_P (cst))
>      return NULL_RTX;
> +  /* I think VALID_BCST_MODE_P should be sufficient to
> +     make sure cst is CONST_INT or CONST_DOUBLE.  */
>
>    int n_elts = GET_MODE_NUNITS (mode);
>    if (CONST_VECTOR_NUNITS (x) != n_elts)
>      return NULL_RTX;
> +  /* Do we need this? I saw from caller side there's already
> +       if (GET_MODE (op2) != mode)
> + op2 = gen_lowpart (mode, op2);
> + tmp2 = ix86_gen_bcst_mem (mode, op2);  */
> +
>
>    for (int i = 1; i < n_elts; i++)
>      if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i)))
>        return NULL_RTX;
> +  /* CONST_VECTOR_DUPLICATE_P (op)? */
>
>    rtx mem = force_const_mem (GET_MODE_INNER (mode), cst);
>    return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem));
> @@ -25709,6 +25717,21 @@ ix86_ternlog_idx (rtx op, rtx *args)
>     || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa)
>   return -1;
>        return INTVAL (XVECEXP (op, 0, 3));
> +      /* I think we can add some testcase for this.
> + .i.e
> + #include <immintrin.h>
> +
> + __m256i
> + foo (__m256i a, __m256i b, __m256i c)
> + {
> + return (a & _mm256_ternarylogic_epi64 (a, b, c, 0xe4));
> + }
> +
> + __m256i
> + foo1 (__m256i a, __m256i b, __m256i c)
> + {
> + return (b & _mm256_ternarylogic_epi64 (a, b, c, 0xe4));
> + }  */
>
>      default:
>        return -1;
> @@ -25778,6 +25801,8 @@ ix86_ternlog_operand_p (rtx op)
>        if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
>     && (ix86_ternlog_leaf_p (op1, mode)
>         || vector_all_ones_operand (op1, mode)))
> + /* There's CONST_VECTOR check in x86_ternlog_leaf_p,
> +    so vector_all_ones_operand is not needed.  */
>   return false;
>        break;
>
> @@ -25862,6 +25887,10 @@ ix86_expand_ternlog (machine_mode mode, rtx
> op0, rtx op1, rtx op2, int idx,
>        if ((!op0 || !side_effects_p (op0))
>            && (!op1 || !side_effects_p (op1))
>            && (!op2 || !side_effects_p (op2)))
> + /* I think only op2 needs to check side_effects_p, op0
> +    and op1 must be register operand when it exists, no need for
> side_effects_p?
> +    Similar for all below side_effects_p (op0/op1)
> +    the check is redundant.  */
>          {
>     emit_move_insn (target, CONST0_RTX (mode));
>     return target;
> @@ -25872,6 +25901,9 @@ ix86_expand_ternlog (machine_mode mode, rtx
> op0, rtx op1, rtx op2, int idx,
>        if ((!op1 || !side_effects_p (op1))
>     && op0 && register_operand (op0, mode)
>     && op2 && register_operand (op2, mode))
> + /* op0/op1 must be register_operand when it exists,
> +    so register_operand (op0/op1, mode) is not needed.
> +    similar for all below register_operand (op0/op1, mode).  */
>   return ix86_expand_ternlog_andnot (mode, op0, op2, target);
>        break;
>
> @@ -25879,6 +25911,7 @@ ix86_expand_ternlog (machine_mode mode, rtx
> op0, rtx op1, rtx op2, int idx,
>        if ((!op2 || !side_effects_p (op2))
>     && op0 && register_operand (op0, mode)
>     && op1 && register_operand (op1, mode))
> + /* op0 && op1? */
>   return ix86_expand_ternlog_andnot (mode, op0, op1, target);
>        break;
>
> @@ -25948,6 +25981,7 @@ ix86_expand_ternlog (machine_mode mode, rtx
> op0, rtx op1, rtx op2, int idx,
>        if ((!op0 || !side_effects_p (op0))
>     && (!op1 || !side_effects_p (op1))
>            && op2)
> + /* if (op2).  */
>   {
>     if (GET_MODE (op2) != mode)
>       op2 = gen_lowpart (mode, op2);
> @@ -25961,18 +25995,21 @@ ix86_expand_ternlog (machine_mode mode, rtx
> op0, rtx op1, rtx op2, int idx,
>      case 0x5a:  /* a^c */
>        if (op0 && op2
>            && (!op1 || !side_effects_p (op1)))
> + /* if (op0 && op2).  */
>   return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target);
>        break;
>
>      case 0x66:  /* b^c */
>        if ((!op0 || !side_effects_p (op0))
>            && op1 && op2)
> + /* if (op1 && op2).  */
>   return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target);
>        break;
>
>      case 0x88:  /* b&c */
>        if ((!op0 || !side_effects_p (op0))
>            && op1 && op2)
> + /* if (op1 && op2).  */
>   return ix86_expand_ternlog_binop (AND, mode, op1, op2, target);
>        break;
>
> @@ -26054,6 +26091,9 @@ ix86_expand_ternlog (machine_mode mode, rtx
> op0, rtx op1, rtx op2, int idx,
>      }
>
>    tmp0 = register_operand (op0, mode) ? op0 : force_reg (mode, op0);
> +  /* Do you observe there're cases of op0 not register_operand?.
> +     if it's from <avx512>_vternlog<mode>_mask, it must be register_operand.
> +     if it's from ix86_ternlog_idx, it must REG_P.  */
>    if (GET_MODE (tmp0) != mode)
>      tmp0 = gen_lowpart (mode, tmp0);
>
> @@ -26061,6 +26101,7 @@ ix86_expand_ternlog (machine_mode mode, rtx
> op0, rtx op1, rtx op2, int idx,
>      tmp1 = copy_rtx (tmp0);
>    else if (!register_operand (op1, mode))
>      tmp1 = force_reg (mode, op1);
> +  /* Ditto.  */
>    else
>      tmp1 = op1;
>    if (GET_MODE (tmp1) != mode)
>
>
>
>
> --
> BR,
> Hongtao

Got ICE for below testcase

#include <immintrin.h>
__m256i
foo2 (__m256i** a, __m256i b)
{
  return ~(**a);
}

with -march=x86-64-v4 -O2

 (insn 17 7 13 2 (set (reg:V4DI 103 [ _5 ])
        (xor:V4DI (mem:V4DI (mem/f:DI (reg:DI 105) [1 *a_4(D)+0 S8
A64]) [0 *_1+0 S32 A256])
            (const_vector:V4DI [
                    (const_int -1 [0xffffffffffffffff]) repeated x4
                ]))) "test.c":7:10 -1
     (expr_list:REG_DEAD (reg:DI 105)
        (nil)))
during RTL pass: ira

I think we need to check memory_operand in ix86_ternlog_idx

    case MEM:
      if (MEM_P (op)
  && MEM_VOLATILE_P (op)
  && !volatile_ok)
return -1;
      /* FALLTHRU */

-- 
BR,
Hongtao

Reply via email to