On Mon, May 27, 2024 at 2:48 PM Hongtao Liu <crazy...@gmail.com> wrote: > > On Sat, May 18, 2024 at 4:10 AM Roger Sayle <ro...@nextmovesoftware.com> > wrote: > > > > > > Hi Hongtao, > > Many thanks for the review, bug fixes and suggestions for improvements. > > This revised version of the patch, implements all of your corrections. In > > theory > > the "ternlog idx" should guarantee that some operands are non-null, but I > > agree > > that it's better defensive programming to check invariants not easily > > proved. > > Instead of calling ix86_expand_vector_move, I use > > ix86_broadcast_from_constant > > to achieve the same effect of using a broadcast when possible, but has the > > benefit > > of still using a memory operand (instead of a vector load) when > > broadcasting isn't > > possible. There are other places that could benefit from the same trick, > > but I can > > address these in a follow-up patch (it may even be preferrable to keep > > these as > > CONST_VECTOR during early RTL passes and lower to broadcast or constant pool > > using splitters). > > > > This revised patch has been tested on x86_64-pc-linux-gnu with make > > bootstrap > > and make -k check, both with and without --target_board=unix{-m32} > > with no new failures. Ok for mainline? > 1 file changed, 41 insertions(+) > gcc/config/i386/i386-expand.cc | 41 +++++++++++++++++++++++++++++++++++++++++ > > modified gcc/config/i386/i386-expand.cc > @@ -25579,14 +25579,22 @@ ix86_gen_bcst_mem (machine_mode mode, rtx x) > && !CONST_DOUBLE_P (cst) > && !CONST_FIXED_P (cst)) > return NULL_RTX; > + /* I think VALID_BCST_MODE_P should be sufficient to > + make sure cst is CONST_INT or CONST_DOUBLE. */ > > int n_elts = GET_MODE_NUNITS (mode); > if (CONST_VECTOR_NUNITS (x) != n_elts) > return NULL_RTX; > + /* Do we need this? I saw from caller side there's already > + if (GET_MODE (op2) != mode) > + op2 = gen_lowpart (mode, op2); > + tmp2 = ix86_gen_bcst_mem (mode, op2); */ > + > > for (int i = 1; i < n_elts; i++) > if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i))) > return NULL_RTX; > + /* CONST_VECTOR_DUPLICATE_P (op)? */ > > rtx mem = force_const_mem (GET_MODE_INNER (mode), cst); > return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem)); > @@ -25709,6 +25717,21 @@ ix86_ternlog_idx (rtx op, rtx *args) > || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa) > return -1; > return INTVAL (XVECEXP (op, 0, 3)); > + /* I think we can add some testcase for this. > + .i.e > + #include <immintrin.h> > + > + __m256i > + foo (__m256i a, __m256i b, __m256i c) > + { > + return (a & _mm256_ternarylogic_epi64 (a, b, c, 0xe4)); > + } > + > + __m256i > + foo1 (__m256i a, __m256i b, __m256i c) > + { > + return (b & _mm256_ternarylogic_epi64 (a, b, c, 0xe4)); > + } */ > > default: > return -1; > @@ -25778,6 +25801,8 @@ ix86_ternlog_operand_p (rtx op) > if (ix86_ternlog_leaf_p (XEXP (op, 0), mode) > && (ix86_ternlog_leaf_p (op1, mode) > || vector_all_ones_operand (op1, mode))) > + /* There's CONST_VECTOR check in x86_ternlog_leaf_p, > + so vector_all_ones_operand is not needed. */ > return false; > break; > > @@ -25862,6 +25887,10 @@ ix86_expand_ternlog (machine_mode mode, rtx > op0, rtx op1, rtx op2, int idx, > if ((!op0 || !side_effects_p (op0)) > && (!op1 || !side_effects_p (op1)) > && (!op2 || !side_effects_p (op2))) > + /* I think only op2 needs to check side_effects_p, op0 > + and op1 must be register operand when it exists, no need for > side_effects_p? > + Similar for all below side_effects_p (op0/op1) > + the check is redundant. */ > { > emit_move_insn (target, CONST0_RTX (mode)); > return target; > @@ -25872,6 +25901,9 @@ ix86_expand_ternlog (machine_mode mode, rtx > op0, rtx op1, rtx op2, int idx, > if ((!op1 || !side_effects_p (op1)) > && op0 && register_operand (op0, mode) > && op2 && register_operand (op2, mode)) > + /* op0/op1 must be register_operand when it exists, > + so register_operand (op0/op1, mode) is not needed. > + similar for all below register_operand (op0/op1, mode). */ > return ix86_expand_ternlog_andnot (mode, op0, op2, target); > break; > > @@ -25879,6 +25911,7 @@ ix86_expand_ternlog (machine_mode mode, rtx > op0, rtx op1, rtx op2, int idx, > if ((!op2 || !side_effects_p (op2)) > && op0 && register_operand (op0, mode) > && op1 && register_operand (op1, mode)) > + /* op0 && op1? */ > return ix86_expand_ternlog_andnot (mode, op0, op1, target); > break; > > @@ -25948,6 +25981,7 @@ ix86_expand_ternlog (machine_mode mode, rtx > op0, rtx op1, rtx op2, int idx, > if ((!op0 || !side_effects_p (op0)) > && (!op1 || !side_effects_p (op1)) > && op2) > + /* if (op2). */ > { > if (GET_MODE (op2) != mode) > op2 = gen_lowpart (mode, op2); > @@ -25961,18 +25995,21 @@ ix86_expand_ternlog (machine_mode mode, rtx > op0, rtx op1, rtx op2, int idx, > case 0x5a: /* a^c */ > if (op0 && op2 > && (!op1 || !side_effects_p (op1))) > + /* if (op0 && op2). */ > return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target); > break; > > case 0x66: /* b^c */ > if ((!op0 || !side_effects_p (op0)) > && op1 && op2) > + /* if (op1 && op2). */ > return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target); > break; > > case 0x88: /* b&c */ > if ((!op0 || !side_effects_p (op0)) > && op1 && op2) > + /* if (op1 && op2). */ > return ix86_expand_ternlog_binop (AND, mode, op1, op2, target); > break; > > @@ -26054,6 +26091,9 @@ ix86_expand_ternlog (machine_mode mode, rtx > op0, rtx op1, rtx op2, int idx, > } > > tmp0 = register_operand (op0, mode) ? op0 : force_reg (mode, op0); > + /* Do you observe there're cases of op0 not register_operand?. > + if it's from <avx512>_vternlog<mode>_mask, it must be register_operand. > + if it's from ix86_ternlog_idx, it must REG_P. */ > if (GET_MODE (tmp0) != mode) > tmp0 = gen_lowpart (mode, tmp0); > > @@ -26061,6 +26101,7 @@ ix86_expand_ternlog (machine_mode mode, rtx > op0, rtx op1, rtx op2, int idx, > tmp1 = copy_rtx (tmp0); > else if (!register_operand (op1, mode)) > tmp1 = force_reg (mode, op1); > + /* Ditto. */ > else > tmp1 = op1; > if (GET_MODE (tmp1) != mode) > > > > > -- > BR, > Hongtao
Got ICE for below testcase #include <immintrin.h> __m256i foo2 (__m256i** a, __m256i b) { return ~(**a); } with -march=x86-64-v4 -O2 (insn 17 7 13 2 (set (reg:V4DI 103 [ _5 ]) (xor:V4DI (mem:V4DI (mem/f:DI (reg:DI 105) [1 *a_4(D)+0 S8 A64]) [0 *_1+0 S32 A256]) (const_vector:V4DI [ (const_int -1 [0xffffffffffffffff]) repeated x4 ]))) "test.c":7:10 -1 (expr_list:REG_DEAD (reg:DI 105) (nil))) during RTL pass: ira I think we need to check memory_operand in ix86_ternlog_idx case MEM: if (MEM_P (op) && MEM_VOLATILE_P (op) && !volatile_ok) return -1; /* FALLTHRU */ -- BR, Hongtao