From: Richard Henderson <r...@twiddle.net> --- gcc/doc/md.texi | 6 ++ gcc/genopinit.c | 1 + gcc/optabs.c | 216 ++++++++++++++++++++++++++++++++++++----------- gcc/optabs.h | 12 ++- gcc/tree-vect-generic.c | 2 +- 5 files changed, 181 insertions(+), 56 deletions(-)
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index fe27210..68a5548 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -4041,6 +4041,12 @@ be computed modulo @math{2*@var{N}}. Note that if @code{rtx_equal_p(operand1, operand2)}, this can be implemented with just operand 1 and selector elements modulo @var{N}. +In order to make things easy for a number of targets, if there is no +@samp{vec_perm} pattern for mode @var{m}, but there is for mode @var{q} +where @var{q} is a vector of @code{QImode} of the same width as @var{m}, +the middle-end will lower the mode @var{m} @code{VEC_PERM_EXPR} to +mode @var{q}. + @cindex @code{vec_perm_const@var{m}} instruction pattern @item @samp{vec_perm_const@var{m}} Like @samp{vec_perm} except that the permutation is a compile-time diff --git a/gcc/genopinit.c b/gcc/genopinit.c index 4eefa03..d40e4c4 100644 --- a/gcc/genopinit.c +++ b/gcc/genopinit.c @@ -254,6 +254,7 @@ static const char * const optabs[] = "set_optab_handler (vec_shr_optab, $A, CODE_FOR_$(vec_shr_$a$))", "set_optab_handler (vec_realign_load_optab, $A, CODE_FOR_$(vec_realign_load_$a$))", "set_direct_optab_handler (vec_perm_optab, $A, CODE_FOR_$(vec_perm$a$))", + "set_direct_optab_handler (vec_perm_const_optab, $A, CODE_FOR_$(vec_perm_const$a$))", "set_convert_optab_handler (vcond_optab, $A, $B, CODE_FOR_$(vcond$a$b$))", "set_convert_optab_handler (vcondu_optab, $A, $B, CODE_FOR_$(vcondu$a$b$))", "set_optab_handler (ssum_widen_optab, $A, CODE_FOR_$(widen_ssum$I$a3$))", diff --git a/gcc/optabs.c b/gcc/optabs.c index e112467..e9a23f4 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -6687,87 +6687,203 @@ vector_compare_rtx (tree cond, bool unsignedp, enum insn_code icode) /* Return true if VEC_PERM_EXPR can be expanded using SIMD extensions of the CPU. 
*/ + bool -expand_vec_perm_expr_p (enum machine_mode mode, tree v0, tree v1, tree mask) +can_vec_perm_expr_p (tree type, tree sel) { - int v0_mode_s = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (v0)))); - int mask_mode_s = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (mask)))); + enum machine_mode mode, qimode; + mode = TYPE_MODE (type); + + /* If the target doesn't implement a vector mode for the vector type, + then no operations are supported. */ + if (!VECTOR_MODE_P (mode)) + return false; + + if (TREE_CODE (sel) == VECTOR_CST) + { + if (direct_optab_handler (vec_perm_const_optab, mode) != CODE_FOR_nothing + && targetm.vectorize.builtin_vec_perm_ok (type, sel)) + return true; + } - if (TREE_CODE (mask) == VECTOR_CST - && targetm.vectorize.builtin_vec_perm_ok (TREE_TYPE (v0), mask)) + if (direct_optab_handler (vec_perm_optab, mode) != CODE_FOR_nothing) return true; - if (v0_mode_s != mask_mode_s - || TYPE_VECTOR_SUBPARTS (TREE_TYPE (v0)) - != TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)) - || TYPE_VECTOR_SUBPARTS (TREE_TYPE (v1)) - != TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask))) + /* We allow fallback to a QI vector mode, and adjust the mask. */ + qimode = mode_for_vector (QImode, GET_MODE_SIZE (mode)); + if (!VECTOR_MODE_P (qimode)) return false; - return direct_optab_handler (vec_perm_optab, mode) != CODE_FOR_nothing; + /* ??? For completeness, we ought to check the QImode version of + vec_perm_const_optab. But all users of this implicit lowering + feature implement the variable vec_perm_optab. */ + if (direct_optab_handler (vec_perm_optab, qimode) == CODE_FOR_nothing) + return false; + + /* In order to support the lowering of non-constant permutations, + we need to support shifts and adds. 
*/ + if (TREE_CODE (sel) != VECTOR_CST) + { + if (GET_MODE_UNIT_SIZE (mode) > 2 + && optab_handler (ashl_optab, mode) == CODE_FOR_nothing + && optab_handler (vashl_optab, mode) == CODE_FOR_nothing) + return false; + if (optab_handler (add_optab, qimode) == CODE_FOR_nothing) + return false; + } + + return true; } -/* Generate instructions for VEC_COND_EXPR given its type and three - operands. */ -rtx -expand_vec_perm_expr (tree type, tree v0, tree v1, tree mask, rtx target) +/* A subroutine of expand_vec_perm_expr for expanding one vec_perm insn. */ + +static rtx +expand_vec_perm_expr_1 (enum insn_code icode, rtx target, + rtx v0, rtx v1, rtx sel) { + enum machine_mode tmode = GET_MODE (target); + enum machine_mode smode = GET_MODE (sel); struct expand_operand ops[4]; - enum insn_code icode; - enum machine_mode mode = TYPE_MODE (type); - gcc_checking_assert (expand_vec_perm_expr_p (mode, v0, v1, mask)); + create_output_operand (&ops[0], target, tmode); + create_input_operand (&ops[3], sel, smode); - if (TREE_CODE (mask) == VECTOR_CST) + /* Make an effort to preserve v0 == v1. The target expander is able to + rely on this to determine if we're permuting a single input operand. 
*/ + if (rtx_equal_p (v0, v1)) { - tree m_type, call; - tree fn = targetm.vectorize.builtin_vec_perm (TREE_TYPE (v0), &m_type); + if (!insn_operand_matches (icode, 1, v0)) + v0 = force_reg (tmode, v0); + gcc_checking_assert (insn_operand_matches (icode, 1, v0)); + gcc_checking_assert (insn_operand_matches (icode, 2, v0)); - if (!fn) - goto vec_perm; + create_fixed_operand (&ops[1], v0); + create_fixed_operand (&ops[2], v0); + } + else + { + create_input_operand (&ops[1], v0, tmode); + create_input_operand (&ops[2], v1, tmode); + } - if (m_type != TREE_TYPE (TREE_TYPE (mask))) - { - int units = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)); - tree cvt = build_vector_type (m_type, units); - mask = fold_convert (cvt, mask); - } + if (maybe_expand_insn (icode, 4, ops)) + return ops[0].value; + return NULL_RTX; +} - call = fold_build1 (ADDR_EXPR, build_pointer_type (TREE_TYPE (fn)), fn); - call = build_call_nary (type, call, 3, v0, v1, mask); +/* Generate instructions for VEC_PERM_EXPR given its type and three + operands. */ +rtx +expand_vec_perm_expr (tree type, tree v0, tree v1, tree sel, rtx target) +{ + enum insn_code icode; + enum machine_mode mode = TYPE_MODE (type); + enum machine_mode qimode; + rtx v0_rtx, v1_rtx, sel_rtx, *vec, vt, tmp; + unsigned int i, w, e, u; - return expand_expr_real_1 (call, target, VOIDmode, EXPAND_NORMAL, NULL); + if (!target) + target = gen_reg_rtx (mode); + v0_rtx = expand_normal (v0); + if (operand_equal_p (v0, v1, 0)) + v1_rtx = v0_rtx; + else + v1_rtx = expand_normal (v1); + sel_rtx = expand_normal (sel); + + /* If the input is a constant, expand it specially. */ + if (CONSTANT_P (sel_rtx)) + { + icode = direct_optab_handler (vec_perm_const_optab, mode); + if (icode != CODE_FOR_nothing + && targetm.vectorize.builtin_vec_perm_ok (TREE_TYPE (v0), sel) + && (tmp = expand_vec_perm_expr_1 (icode, target, v0_rtx, + v1_rtx, sel_rtx)) != NULL) + return tmp; } - vec_perm: + /* Otherwise fall back to a fully variable permutation. 
*/ icode = direct_optab_handler (vec_perm_optab, mode); + if (icode != CODE_FOR_nothing + && (tmp = expand_vec_perm_expr_1 (icode, target, v0_rtx, + v1_rtx, sel_rtx)) != NULL) + return tmp; + + /* As a special case to aid several targets, lower the element-based + permutation to a byte-based permutation and try again. */ + qimode = mode_for_vector (QImode, GET_MODE_SIZE (mode)); + if (!VECTOR_MODE_P (qimode)) + return NULL_RTX; + /* ??? For completeness, we ought to check the QImode version of + vec_perm_const_optab. But all users of this implicit lowering + feature implement the variable vec_perm_optab. */ + icode = direct_optab_handler (vec_perm_optab, qimode); if (icode == CODE_FOR_nothing) - return 0; + return NULL_RTX; - create_output_operand (&ops[0], target, mode); - create_input_operand (&ops[3], expand_normal (mask), - TYPE_MODE (TREE_TYPE (mask))); + w = GET_MODE_SIZE (mode); + e = GET_MODE_NUNITS (mode); + u = GET_MODE_UNIT_SIZE (mode); + vec = XALLOCAVEC (rtx, w); - if (operand_equal_p (v0, v1, 0)) + if (CONSTANT_P (sel_rtx)) { - rtx rtx_v0 = expand_normal (v0); - if (!insn_operand_matches (icode, 1, rtx_v0)) - rtx_v0 = force_reg (mode, rtx_v0); - - gcc_checking_assert (insn_operand_matches (icode, 2, rtx_v0)); + unsigned int j; + for (i = 0; i < e; ++i) + { + unsigned int this_e = INTVAL (XVECEXP (sel_rtx, 0, i)); + this_e &= 2 * e - 1; + this_e *= u; - create_fixed_operand (&ops[1], rtx_v0); - create_fixed_operand (&ops[2], rtx_v0); + for (j = 0; j < u; ++j) + vec[i * u + j] = GEN_INT (this_e + j); + } + sel_rtx = gen_rtx_CONST_VECTOR (qimode, gen_rtvec_v (w, vec)); } else { - create_input_operand (&ops[1], expand_normal (v0), mode); - create_input_operand (&ops[2], expand_normal (v1), mode); - } + /* Multiply each element by its byte size. 
*/ + if (u == 2) + sel_rtx = expand_simple_binop (mode, PLUS, sel_rtx, sel_rtx, + sel_rtx, 0, OPTAB_DIRECT); + else + sel_rtx = expand_simple_binop (mode, ASHIFT, sel_rtx, + GEN_INT (exact_log2 (u)), + sel_rtx, 0, OPTAB_DIRECT); + gcc_assert (sel_rtx); - expand_insn (icode, 4, ops); - return ops[0].value; + /* Broadcast the low byte of each element into each of its bytes. */ + for (i = 0; i < w; ++i) + { + int this_e = i / u * u; + if (BYTES_BIG_ENDIAN) + this_e += u - 1; + vec[i] = GEN_INT (this_e); + } + vt = gen_rtx_CONST_VECTOR (qimode, gen_rtvec_v (w, vec)); + sel_rtx = gen_lowpart (qimode, sel_rtx); + sel_rtx = expand_vec_perm_expr_1 (icode, gen_reg_rtx (qimode), + sel_rtx, sel_rtx, vt); + gcc_assert (sel_rtx != NULL); + + /* Add the byte offset to each byte element. */ + /* Note that the definition of the indices here is memory ordering, + so there should be no difference between big and little endian. */ + for (i = 0; i < w; ++i) + vec[i] = GEN_INT (i % u); + vt = gen_rtx_CONST_VECTOR (qimode, gen_rtvec_v (w, vec)); + sel_rtx = expand_simple_binop (qimode, PLUS, sel_rtx, vt, + NULL_RTX, 0, OPTAB_DIRECT); + gcc_assert (sel_rtx); + } + + tmp = expand_vec_perm_expr_1 (icode, gen_lowpart (qimode, target), + gen_lowpart (qimode, v0_rtx), + gen_lowpart (qimode, v1_rtx), sel_rtx); + gcc_assert (tmp != NULL); + + return gen_lowpart (mode, tmp); } diff --git a/gcc/optabs.h b/gcc/optabs.h index 41ae7eb..926d21f 100644 --- a/gcc/optabs.h +++ b/gcc/optabs.h @@ -638,9 +638,6 @@ enum direct_optab_index DOI_reload_in, DOI_reload_out, - /* Vector shuffling. */ - DOI_vec_perm, - /* Block move operation. */ DOI_movmem, @@ -688,6 +685,10 @@ enum direct_optab_index /* Atomic clear with release semantics. */ DOI_sync_lock_release, + /* Vector permutation. 
*/ + DOI_vec_perm, + DOI_vec_perm_const, + DOI_MAX }; @@ -704,7 +705,6 @@ typedef struct direct_optab_d *direct_optab; #endif #define reload_in_optab (&direct_optab_table[(int) DOI_reload_in]) #define reload_out_optab (&direct_optab_table[(int) DOI_reload_out]) -#define vec_perm_optab (&direct_optab_table[(int) DOI_vec_perm]) #define movmem_optab (&direct_optab_table[(int) DOI_movmem]) #define setmem_optab (&direct_optab_table[(int) DOI_setmem]) #define cmpstr_optab (&direct_optab_table[(int) DOI_cmpstr]) @@ -734,6 +734,8 @@ typedef struct direct_optab_d *direct_optab; (&direct_optab_table[(int) DOI_sync_lock_test_and_set]) #define sync_lock_release_optab \ (&direct_optab_table[(int) DOI_sync_lock_release]) +#define vec_perm_optab (&direct_optab_table[(int) DOI_vec_perm]) +#define vec_perm_const_optab (&direct_optab_table[(int) DOI_vec_perm_const]) /* Target-dependent globals. */ struct target_optabs { @@ -889,7 +891,7 @@ extern rtx expand_vec_cond_expr (tree, tree, tree, tree, rtx); extern rtx expand_vec_shift_expr (sepops, rtx); /* Return tree if target supports vector operations for VEC_PERM_EXPR. */ -bool expand_vec_perm_expr_p (enum machine_mode, tree, tree, tree); +extern bool can_vec_perm_expr_p (tree, tree); /* Generate code for VEC_PERM_EXPR. */ extern rtx expand_vec_perm_expr (tree, tree, tree, tree, rtx); diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c index 7fba9bb..663ea00 100644 --- a/gcc/tree-vect-generic.c +++ b/gcc/tree-vect-generic.c @@ -641,7 +641,7 @@ lower_vec_perm (gimple_stmt_iterator *gsi) location_t loc = gimple_location (gsi_stmt (*gsi)); unsigned i; - if (expand_vec_perm_expr_p (TYPE_MODE (vect_type), vec0, vec1, mask)) + if (can_vec_perm_expr_p (vect_type, mask)) return; warning_at (loc, OPT_Wvector_operation_performance, -- 1.7.6.4