On Fri, Dec 15, 2017 at 1:29 AM, Richard Sandiford <richard.sandif...@linaro.org> wrote: > This patch just adds VEC_DUPLICATE_EXPR, since the VEC_DUPLICATE_CST > isn't needed with the new VECTOR_CST layout. It's really just the > original patch with bits removed, but just in case: > > Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64-linux-gnu. > OK to install?
OK, to keep things simple at this point. Note that I'd eventually like to see this expressed as VEC_PERM_EXPR <scalar_type_1, scalar_type_1, { 0, ... }>. For reductions, when we need { x, 0, ... }, we now have to use a VEC_DUPLICATE_EXPR to make x a vector and then a VEC_PERM_EXPR to merge it with { 0, ... }, rather than a single VEC_PERM_EXPR <x_1, 0, { 0, 1, 1, 1, ... }>, right? Thanks, Richard. > Richard > > > 2017-12-15 Richard Sandiford <richard.sandif...@linaro.org> > Alan Hayward <alan.hayw...@arm.com> > David Sherwood <david.sherw...@arm.com> > > gcc/ > * doc/generic.texi (VEC_DUPLICATE_EXPR): Document. > (VEC_COND_EXPR): Add missing @tindex. > * doc/md.texi (vec_duplicate@var{m}): Document. > * tree.def (VEC_DUPLICATE_EXPR): New tree code. > * tree.c (build_vector_from_val): Add stubbed-out handling of > variable-length vectors, using VEC_DUPLICATE_EXPR. > (uniform_vector_p): Handle VEC_DUPLICATE_EXPR. > * cfgexpand.c (expand_debug_expr): Likewise. > * tree-cfg.c (verify_gimple_assign_unary): Likewise. > * tree-inline.c (estimate_operator_cost): Likewise. > * tree-pretty-print.c (dump_generic_node): Likewise. > * tree-vect-generic.c (ssa_uniform_vector_p): Likewise. > * fold-const.c (const_unop): Fold VEC_DUPLICATE_EXPRs of a constant. > (test_vec_duplicate_folding): New function. > (fold_const_c_tests): Call it. > * optabs.def (vec_duplicate_optab): New optab. > * optabs-tree.c (optab_for_tree_code): Handle VEC_DUPLICATE_EXPR. > * optabs.h (expand_vector_broadcast): Declare. > * optabs.c (expand_vector_broadcast): Make non-static. Try using > vec_duplicate_optab. > * expr.c (store_constructor): Try using vec_duplicate_optab for > uniform vectors. > (expand_expr_real_2): Handle VEC_DUPLICATE_EXPR. 
> > Index: gcc/doc/generic.texi > =================================================================== > --- gcc/doc/generic.texi 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/doc/generic.texi 2017-12-15 00:24:47.498459276 +0000 > @@ -1768,6 +1768,7 @@ a value from @code{enum annot_expr_kind} > > @node Vectors > @subsection Vectors > +@tindex VEC_DUPLICATE_EXPR > @tindex VEC_LSHIFT_EXPR > @tindex VEC_RSHIFT_EXPR > @tindex VEC_WIDEN_MULT_HI_EXPR > @@ -1779,9 +1780,14 @@ a value from @code{enum annot_expr_kind} > @tindex VEC_PACK_TRUNC_EXPR > @tindex VEC_PACK_SAT_EXPR > @tindex VEC_PACK_FIX_TRUNC_EXPR > +@tindex VEC_COND_EXPR > @tindex SAD_EXPR > > @table @code > +@item VEC_DUPLICATE_EXPR > +This node has a single operand and represents a vector in which every > +element is equal to that operand. > + > @item VEC_LSHIFT_EXPR > @itemx VEC_RSHIFT_EXPR > These nodes represent whole vector left and right shifts, respectively. > Index: gcc/doc/md.texi > =================================================================== > --- gcc/doc/md.texi 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/doc/md.texi 2017-12-15 00:24:47.499459075 +0000 > @@ -4888,6 +4888,17 @@ and operand 1 is parallel containing val > the vector mode @var{m}, or a vector mode with the same element mode and > smaller number of elements. > > +@cindex @code{vec_duplicate@var{m}} instruction pattern > +@item @samp{vec_duplicate@var{m}} > +Initialize vector output operand 0 so that each element has the value given > +by scalar input operand 1. The vector has mode @var{m} and the scalar has > +the mode appropriate for one element of @var{m}. > + > +This pattern only handles duplicates of non-constant inputs. Constant > +vectors go through the @code{mov@var{m}} pattern instead. > + > +This pattern is not allowed to @code{FAIL}. > + > @cindex @code{vec_cmp@var{m}@var{n}} instruction pattern > @item @samp{vec_cmp@var{m}@var{n}} > Output a vector comparison. 
Operand 0 of mode @var{n} is the destination for > Index: gcc/tree.def > =================================================================== > --- gcc/tree.def 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/tree.def 2017-12-15 00:24:47.505457868 +0000 > @@ -537,6 +537,9 @@ DEFTREECODE (TARGET_EXPR, "target_expr", > 1 and 2 are NULL. The operands are then taken from the cfg edges. */ > DEFTREECODE (COND_EXPR, "cond_expr", tcc_expression, 3) > > +/* Represents a vector in which every element is equal to operand 0. */ > +DEFTREECODE (VEC_DUPLICATE_EXPR, "vec_duplicate_expr", tcc_unary, 1) > + > /* Vector conditional expression. It is like COND_EXPR, but with > vector operands. > > Index: gcc/tree.c > =================================================================== > --- gcc/tree.c 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/tree.c 2017-12-15 00:24:47.505457868 +0000 > @@ -1785,6 +1785,8 @@ build_vector_from_val (tree vectype, tre > v.quick_push (sc); > return v.build (); > } > + else if (0) > + return fold_build1 (VEC_DUPLICATE_EXPR, vectype, sc); > else > { > vec<constructor_elt, va_gc> *v; > @@ -10468,7 +10470,10 @@ uniform_vector_p (const_tree vec) > > gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))); > > - if (TREE_CODE (vec) == VECTOR_CST) > + if (TREE_CODE (vec) == VEC_DUPLICATE_EXPR) > + return TREE_OPERAND (vec, 0); > + > + else if (TREE_CODE (vec) == VECTOR_CST) > { > if (VECTOR_CST_NPATTERNS (vec) == 1 && VECTOR_CST_DUPLICATE_P (vec)) > return VECTOR_CST_ENCODED_ELT (vec, 0); > Index: gcc/cfgexpand.c > =================================================================== > --- gcc/cfgexpand.c 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/cfgexpand.c 2017-12-15 00:24:47.498459276 +0000 > @@ -5069,6 +5069,7 @@ expand_debug_expr (tree exp) > case VEC_WIDEN_LSHIFT_HI_EXPR: > case VEC_WIDEN_LSHIFT_LO_EXPR: > case VEC_PERM_EXPR: > + case VEC_DUPLICATE_EXPR: > return NULL; > > /* Misc codes. 
*/ > Index: gcc/tree-cfg.c > =================================================================== > --- gcc/tree-cfg.c 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/tree-cfg.c 2017-12-15 00:24:47.503458270 +0000 > @@ -3857,6 +3857,17 @@ verify_gimple_assign_unary (gassign *stm > case CONJ_EXPR: > break; > > + case VEC_DUPLICATE_EXPR: > + if (TREE_CODE (lhs_type) != VECTOR_TYPE > + || !useless_type_conversion_p (TREE_TYPE (lhs_type), rhs1_type)) > + { > + error ("vec_duplicate should be from a scalar to a like vector"); > + debug_generic_expr (lhs_type); > + debug_generic_expr (rhs1_type); > + return true; > + } > + return false; > + > default: > gcc_unreachable (); > } > Index: gcc/tree-inline.c > =================================================================== > --- gcc/tree-inline.c 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/tree-inline.c 2017-12-15 00:24:47.504458069 +0000 > @@ -3928,6 +3928,7 @@ estimate_operator_cost (enum tree_code c > case VEC_PACK_FIX_TRUNC_EXPR: > case VEC_WIDEN_LSHIFT_HI_EXPR: > case VEC_WIDEN_LSHIFT_LO_EXPR: > + case VEC_DUPLICATE_EXPR: > > return 1; > > Index: gcc/tree-pretty-print.c > =================================================================== > --- gcc/tree-pretty-print.c 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/tree-pretty-print.c 2017-12-15 00:24:47.504458069 +0000 > @@ -3178,6 +3178,15 @@ dump_generic_node (pretty_printer *pp, t > pp_string (pp, " > "); > break; > > + case VEC_DUPLICATE_EXPR: > + pp_space (pp); > + for (str = get_tree_code_name (code); *str; str++) > + pp_character (pp, TOUPPER (*str)); > + pp_string (pp, " < "); > + dump_generic_node (pp, TREE_OPERAND (node, 0), spc, flags, false); > + pp_string (pp, " > "); > + break; > + > case VEC_UNPACK_HI_EXPR: > pp_string (pp, " VEC_UNPACK_HI_EXPR < "); > dump_generic_node (pp, TREE_OPERAND (node, 0), spc, flags, false); > Index: gcc/tree-vect-generic.c > =================================================================== > --- gcc/tree-vect-generic.c 
2017-12-15 00:24:47.213516622 +0000 > +++ gcc/tree-vect-generic.c 2017-12-15 00:24:47.504458069 +0000 > @@ -1418,6 +1418,7 @@ lower_vec_perm (gimple_stmt_iterator *gs > ssa_uniform_vector_p (tree op) > { > if (TREE_CODE (op) == VECTOR_CST > + || TREE_CODE (op) == VEC_DUPLICATE_EXPR > || TREE_CODE (op) == CONSTRUCTOR) > return uniform_vector_p (op); > if (TREE_CODE (op) == SSA_NAME) > Index: gcc/fold-const.c > =================================================================== > --- gcc/fold-const.c 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/fold-const.c 2017-12-15 00:24:47.501458673 +0000 > @@ -1771,6 +1771,11 @@ const_unop (enum tree_code code, tree ty > return elts.build (); > } > > + case VEC_DUPLICATE_EXPR: > + if (CONSTANT_CLASS_P (arg0)) > + return build_vector_from_val (type, arg0); > + return NULL_TREE; > + > default: > break; > } > @@ -14442,6 +14447,22 @@ test_vector_folding () > ASSERT_FALSE (integer_nonzerop (fold_build2 (NE_EXPR, res_type, one, > one))); > } > > +/* Verify folding of VEC_DUPLICATE_EXPRs. */ > + > +static void > +test_vec_duplicate_folding () > +{ > + scalar_int_mode int_mode = SCALAR_INT_TYPE_MODE (ssizetype); > + machine_mode vec_mode = targetm.vectorize.preferred_simd_mode (int_mode); > + /* This will be 1 if VEC_MODE isn't a vector mode. */ > + unsigned int nunits = GET_MODE_NUNITS (vec_mode); > + > + tree type = build_vector_type (ssizetype, nunits); > + tree dup5_expr = fold_unary (VEC_DUPLICATE_EXPR, type, ssize_int (5)); > + tree dup5_cst = build_vector_from_val (type, ssize_int (5)); > + ASSERT_TRUE (operand_equal_p (dup5_expr, dup5_cst, 0)); > +} > + > /* Run all of the selftests within this file. 
*/ > > void > @@ -14449,6 +14470,7 @@ fold_const_c_tests () > { > test_arithmetic_folding (); > test_vector_folding (); > + test_vec_duplicate_folding (); > } > > } // namespace selftest > Index: gcc/optabs.def > =================================================================== > --- gcc/optabs.def 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/optabs.def 2017-12-15 00:24:47.502458472 +0000 > @@ -363,3 +363,5 @@ OPTAB_D (atomic_xor_optab, "atomic_xor$I > > OPTAB_D (get_thread_pointer_optab, "get_thread_pointer$I$a") > OPTAB_D (set_thread_pointer_optab, "set_thread_pointer$I$a") > + > +OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE) > Index: gcc/optabs-tree.c > =================================================================== > --- gcc/optabs-tree.c 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/optabs-tree.c 2017-12-15 00:24:47.501458673 +0000 > @@ -199,6 +199,9 @@ optab_for_tree_code (enum tree_code code > return TYPE_UNSIGNED (type) ? > vec_pack_ufix_trunc_optab : vec_pack_sfix_trunc_optab; > > + case VEC_DUPLICATE_EXPR: > + return vec_duplicate_optab; > + > default: > break; > } > Index: gcc/optabs.h > =================================================================== > --- gcc/optabs.h 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/optabs.h 2017-12-15 00:24:47.502458472 +0000 > @@ -182,6 +182,7 @@ extern rtx simplify_expand_binop (machin > enum optab_methods methods); > extern bool force_expand_binop (machine_mode, optab, rtx, rtx, rtx, int, > enum optab_methods); > +extern rtx expand_vector_broadcast (machine_mode, rtx); > > /* Generate code for a simple binary or unary operation. 
"Simple" in > this case means "can be unambiguously described by a (mode, code) > Index: gcc/optabs.c > =================================================================== > --- gcc/optabs.c 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/optabs.c 2017-12-15 00:24:47.502458472 +0000 > @@ -367,7 +367,7 @@ force_expand_binop (machine_mode mode, o > mode of OP must be the element mode of VMODE. If OP is a constant, > then the return value will be a constant. */ > > -static rtx > +rtx > expand_vector_broadcast (machine_mode vmode, rtx op) > { > enum insn_code icode; > @@ -380,6 +380,16 @@ expand_vector_broadcast (machine_mode vm > if (valid_for_const_vec_duplicate_p (vmode, op)) > return gen_const_vec_duplicate (vmode, op); > > + icode = optab_handler (vec_duplicate_optab, vmode); > + if (icode != CODE_FOR_nothing) > + { > + struct expand_operand ops[2]; > + create_output_operand (&ops[0], NULL_RTX, vmode); > + create_input_operand (&ops[1], op, GET_MODE (op)); > + expand_insn (icode, 2, ops); > + return ops[0].value; > + } > + > /* ??? If the target doesn't have a vec_init, then we have no easy way > of performing this operation. Most of this sort of generic support > is hidden away in the vector lowering support in gimple. 
*/ > Index: gcc/expr.c > =================================================================== > --- gcc/expr.c 2017-12-15 00:24:47.213516622 +0000 > +++ gcc/expr.c 2017-12-15 00:24:47.500458874 +0000 > @@ -6598,7 +6598,8 @@ store_constructor (tree exp, rtx target, > constructor_elt *ce; > int i; > int need_to_clear; > - int icode = CODE_FOR_nothing; > + insn_code icode = CODE_FOR_nothing; > + tree elt; > tree elttype = TREE_TYPE (type); > int elt_size = tree_to_uhwi (TYPE_SIZE (elttype)); > machine_mode eltmode = TYPE_MODE (elttype); > @@ -6608,13 +6609,30 @@ store_constructor (tree exp, rtx target, > unsigned n_elts; > alias_set_type alias; > bool vec_vec_init_p = false; > + machine_mode mode = GET_MODE (target); > > gcc_assert (eltmode != BLKmode); > > + /* Try using vec_duplicate_optab for uniform vectors. */ > + if (!TREE_SIDE_EFFECTS (exp) > + && VECTOR_MODE_P (mode) > + && eltmode == GET_MODE_INNER (mode) > + && ((icode = optab_handler (vec_duplicate_optab, mode)) > + != CODE_FOR_nothing) > + && (elt = uniform_vector_p (exp))) > + { > + struct expand_operand ops[2]; > + create_output_operand (&ops[0], target, mode); > + create_input_operand (&ops[1], expand_normal (elt), eltmode); > + expand_insn (icode, 2, ops); > + if (!rtx_equal_p (target, ops[0].value)) > + emit_move_insn (target, ops[0].value); > + break; > + } > + > n_elts = TYPE_VECTOR_SUBPARTS (type); > - if (REG_P (target) && VECTOR_MODE_P (GET_MODE (target))) > + if (REG_P (target) && VECTOR_MODE_P (mode)) > { > - machine_mode mode = GET_MODE (target); > machine_mode emode = eltmode; > > if (CONSTRUCTOR_NELTS (exp) > @@ -6626,7 +6644,7 @@ store_constructor (tree exp, rtx target, > == n_elts); > emode = TYPE_MODE (etype); > } > - icode = (int) convert_optab_handler (vec_init_optab, mode, emode); > + icode = convert_optab_handler (vec_init_optab, mode, emode); > if (icode != CODE_FOR_nothing) > { > unsigned int i, n = n_elts; > @@ -6674,7 +6692,7 @@ store_constructor (tree exp, rtx target, > if 
(need_to_clear && size > 0 && !vector) > { > if (REG_P (target)) > - emit_move_insn (target, CONST0_RTX (GET_MODE (target))); > + emit_move_insn (target, CONST0_RTX (mode)); > else > clear_storage (target, GEN_INT (size), BLOCK_OP_NORMAL); > cleared = 1; > @@ -6682,7 +6700,7 @@ store_constructor (tree exp, rtx target, > > /* Inform later passes that the old value is dead. */ > if (!cleared && !vector && REG_P (target)) > - emit_move_insn (target, CONST0_RTX (GET_MODE (target))); > + emit_move_insn (target, CONST0_RTX (mode)); > > if (MEM_P (target)) > alias = MEM_ALIAS_SET (target); > @@ -6733,8 +6751,7 @@ store_constructor (tree exp, rtx target, > > if (vector) > emit_insn (GEN_FCN (icode) (target, > - gen_rtx_PARALLEL (GET_MODE (target), > - vector))); > + gen_rtx_PARALLEL (mode, vector))); > break; > } > > @@ -9563,6 +9580,12 @@ #define REDUCE_BIT_FIELD(expr) (reduce_b > target = expand_vec_cond_expr (type, treeop0, treeop1, treeop2, > target); > return target; > > + case VEC_DUPLICATE_EXPR: > + op0 = expand_expr (treeop0, NULL_RTX, VOIDmode, modifier); > + target = expand_vector_broadcast (mode, op0); > + gcc_assert (target); > + return target; > + > case BIT_INSERT_EXPR: > { > unsigned bitpos = tree_to_uhwi (treeop2);