Hi! This patch implements vectorization of gather loads with -mavx2, provided dr_may_alias_p (which apparently doesn't use TBAA :(( ) can figure out that the gathered load doesn't overlap any stores in the loop (if there are any). The testcases show what can now be vectorized.
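For reference, the simplest shape that is handled is an indirect load through an index array, essentially f1 from the new avx2-gather-1.c testcase below; here index and data vectors have the same width, so a single vgatherdps per vector iteration is expected:

#define N 1024
float vf1[N + 16], vf2[N];
int k[N];

__attribute__((noinline, noclone)) void
f1 (void)
{
  int i;
  for (i = 0; i < N; i++)
    vf2[i] = vf1[k[i]];	/* load indexed by k[i] becomes a gather load */
}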
I chose to add 4 extra (internal only) gather builtins in addition to the 16 needed for the intrinsics, because builtins whose index vector has a different size from the src/mask/ret vectors would complicate the generic code way too much (we have neither a VEC_SELECT_EXPR nor a VEC_CONCAT_EXPR, and interleaving/extract even/odd is undesirable here). With these 4 extra builtins the generic code always sees src/mask/ret vectors of the same size as the index vector. Either they have the same number of units, in which case just one vgather* insn is needed; or the index has more elements (int index with a double/long long load; see the short example after the ChangeLog), in which case each loaded index vector feeds one vgather* insn using its first half and another using its second half; or the index is long with a float/int load, in which case two index vectors are processed by two vgather* insns and the result is the concatenation of the first halves of the two results. All of this is unconditional only so far; we'd need some tree representation of conditional loads or conditional stores (and could already with AVX use the vmaskmov* insns for that).

Bootstrapped/regtested on x86_64-linux and i686-linux, testcases tested also under sde.  Ok for trunk?

2011-10-26  Jakub Jelinek  <ja...@redhat.com>

	PR tree-optimization/50789
	* tree-vect-stmts.c (process_use): Add force argument, avoid
	exist_non_indexing_operands_for_use_p check if true.
	(vect_mark_stmts_to_be_vectorized): Adjust callers.  Handle
	STMT_VINFO_GATHER_P.
	(gen_perm_mask): New function.
	(perm_mask_for_reverse): Use it.
	(reverse_vec_element): Rename to...
	(permute_vec_elements): ... this.  Add Y and MASK_VEC arguments,
	generalize for any permutations.
	(vectorizable_load): Adjust caller.  Handle STMT_VINFO_GATHER_P.
	* target.def (TARGET_VECTORIZE_BUILTIN_GATHER): New hook.
	* doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_GATHER): Document it.
	* doc/tm.texi: Regenerate.
	* tree-data-ref.c (initialize_data_dependence_relation,
	compute_self_dependence): No longer static.
	* tree-data-ref.h (initialize_data_dependence_relation,
	compute_self_dependence): New prototypes.
	* tree-vect-data-refs.c (vect_check_gather): New function.
	(vect_analyze_data_refs): Detect possible gather load data refs.
	* tree-vectorizer.h (struct _stmt_vec_info): Add gather_p field.
	(STMT_VINFO_GATHER_P): Define.
	(vect_check_gather): New prototype.
	* config/i386/i386-builtin-types.def: Add types for alternate
	gather builtins.
	* config/i386/sse.md (AVXMODE48P_DI): Remove.
	(VEC_GATHER_MODE): Rename mode_attr to...
	(VEC_GATHER_IDXSI): ... this.
	(VEC_GATHER_IDXDI, VEC_GATHER_SRCDI): New mode_attrs.
	(avx2_gathersi<mode>, *avx2_gathersi<mode>): Use <VEC_GATHER_IDXSI>
	instead of <VEC_GATHER_MODE>.
	(avx2_gatherdi<mode>): Use <VEC_GATHER_IDXDI> instead of
	<AVXMODE48P_DI> and <VEC_GATHER_SRCDI> instead of VEC_GATHER_MODE
	on src and mask operands.
	(*avx2_gatherdi<mode>): Likewise.  Use VEC_GATHER_MODE iterator
	instead of AVXMODE48P_DI.
	(avx2_gatherdi<mode>256, *avx2_gatherdi<mode>256): Removed.
	* config/i386/i386.c (enum ix86_builtins): Add
	IX86_BUILTIN_GATHERALTSIV4DF, IX86_BUILTIN_GATHERALTDIV8SF,
	IX86_BUILTIN_GATHERALTSIV4DI and IX86_BUILTIN_GATHERALTDIV8SI.
	(ix86_init_mmx_sse_builtins): Create those builtins.
	(ix86_expand_builtin): Handle those builtins and adjust expansions
	of other gather builtins.
	(ix86_vectorize_builtin_gather): New function.
	(TARGET_VECTORIZE_BUILTIN_GATHER): Define.

	* gcc.target/i386/avx2-gather-1.c: New test.
	* gcc.target/i386/avx2-gather-2.c: New test.
	* gcc.target/i386/avx2-gather-3.c: New test.
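To make the mixed-width case above concrete, here is a minimal sketch, essentially f5 from the new avx2-gather-1.c testcase (the described code generation is what the patch is expected to produce, not verified assembly): a double load indexed by an int array, where each loaded index vector is consumed in two halves by two vgather* insns:

#define N 1024
double vd1[N + 16], vd2[N];
int k[N];

__attribute__((noinline, noclone)) void
f5 (void)
{
  int i;
  for (i = 0; i < N; i++)
    vd2[i] = vd1[k[i]];	/* 32-bit index, 64-bit data: the index vector
			   has twice as many elements as the data vector,
			   so its first and second halves feed separate
			   gather insns.  */
}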
--- gcc/tree-vect-stmts.c.jj 2011-10-26 14:19:11.000000000 +0200 +++ gcc/tree-vect-stmts.c 2011-10-26 16:54:23.000000000 +0200 @@ -332,6 +332,8 @@ exist_non_indexing_operands_for_use_p (t - LIVE_P, RELEVANT - enum values to be set in the STMT_VINFO of the stmt that defined USE. This is done by calling mark_relevant and passing it the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant). + - FORCE is true if exist_non_indexing_operands_for_use_p check shouldn't + be performed. Outputs: Generally, LIVE_P and RELEVANT are used to define the liveness and @@ -351,7 +353,8 @@ exist_non_indexing_operands_for_use_p (t static bool process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p, - enum vect_relevant relevant, VEC(gimple,heap) **worklist) + enum vect_relevant relevant, VEC(gimple,heap) **worklist, + bool force) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); @@ -363,7 +366,7 @@ process_use (gimple stmt, tree use, loop /* case 1: we are only interested in uses that need to be vectorized. Uses that are used for address computation are not considered relevant. */ - if (!exist_non_indexing_operands_for_use_p (use, stmt)) + if (!force && !exist_non_indexing_operands_for_use_p (use, stmt)) return true; if (!vect_is_simple_use (use, loop_vinfo, NULL, &def_stmt, &def, &dt)) @@ -646,7 +649,7 @@ vect_mark_stmts_to_be_vectorized (loop_v break; } - if (is_pattern_stmt_p (vinfo_for_stmt (stmt))) + if (is_pattern_stmt_p (stmt_vinfo)) { /* Pattern statements are not inserted into the code, so FOR_EACH_PHI_OR_STMT_USE optimizes their operands out, and we @@ -660,9 +663,9 @@ vect_mark_stmts_to_be_vectorized (loop_v if (rhs_code == COND_EXPR && COMPARISON_CLASS_P (op)) { if (!process_use (stmt, TREE_OPERAND (op, 0), loop_vinfo, - live_p, relevant, &worklist) + live_p, relevant, &worklist, false) || !process_use (stmt, TREE_OPERAND (op, 1), loop_vinfo, - live_p, relevant, &worklist)) + live_p, relevant, &worklist, false)) { VEC_free (gimple, heap, worklist); return false; @@ -673,7 +676,7 @@ vect_mark_stmts_to_be_vectorized (loop_v { op = gimple_op (stmt, i); if (!process_use (stmt, op, loop_vinfo, live_p, relevant, - &worklist)) + &worklist, false)) { VEC_free (gimple, heap, worklist); return false; @@ -686,7 +689,7 @@ vect_mark_stmts_to_be_vectorized (loop_v { tree arg = gimple_call_arg (stmt, i); if (!process_use (stmt, arg, loop_vinfo, live_p, relevant, - &worklist)) + &worklist, false)) { VEC_free (gimple, heap, worklist); return false; @@ -699,12 +702,25 @@ vect_mark_stmts_to_be_vectorized (loop_v { tree op = USE_FROM_PTR (use_p); if (!process_use (stmt, op, loop_vinfo, live_p, relevant, - &worklist)) + &worklist, false)) { VEC_free (gimple, heap, worklist); return false; } } + + if (STMT_VINFO_GATHER_P (stmt_vinfo)) + { + tree off; + tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL); + gcc_assert (decl); + if (!process_use (stmt, off, loop_vinfo, live_p, relevant, + &worklist, true)) + { + VEC_free (gimple, heap, worklist); + return false; + } + } } /* while worklist */ VEC_free (gimple, heap, worklist); @@ -4082,23 +4098,17 @@ vectorizable_store (gimple stmt, gimple_ return true; } -/* Given a vector type VECTYPE returns a builtin DECL to be used - for vector permutation and returns the mask that implements - reversal of the vector elements. If that is impossible to do, - returns NULL. 
*/ +/* Given a vector type VECTYPE and permutation SEL returns + the VECTOR_CST mask that implements the permutation of the + vector elements. If that is impossible to do, returns NULL. */ static tree -perm_mask_for_reverse (tree vectype) +gen_perm_mask (tree vectype, unsigned char *sel) { tree mask_elt_type, mask_type, mask_vec; int i, nunits; - unsigned char *sel; nunits = TYPE_VECTOR_SUBPARTS (vectype); - sel = XALLOCAVEC (unsigned char, nunits); - - for (i = 0; i < nunits; ++i) - sel[i] = nunits - 1 - i; if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) return NULL; @@ -4109,33 +4119,52 @@ perm_mask_for_reverse (tree vectype) mask_type = get_vectype_for_scalar_type (mask_elt_type); mask_vec = NULL; - for (i = 0; i < nunits; i++) - mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, i), mask_vec); + for (i = nunits - 1; i >= 0; i--) + mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, sel[i]), + mask_vec); mask_vec = build_vector (mask_type, mask_vec); return mask_vec; } -/* Given a vector variable X, that was generated for the scalar LHS of - STMT, generate instructions to reverse the vector elements of X, - insert them a *GSI and return the permuted vector variable. */ +/* Given a vector type VECTYPE returns the VECTOR_CST mask that implements + reversal of the vector elements. If that is impossible to do, + returns NULL. */ + +static tree +perm_mask_for_reverse (tree vectype) +{ + int i, nunits; + unsigned char *sel; + + nunits = TYPE_VECTOR_SUBPARTS (vectype); + sel = XALLOCAVEC (unsigned char, nunits); + + for (i = 0; i < nunits; ++i) + sel[i] = nunits - 1 - i; + + return gen_perm_mask (vectype, sel); +} + +/* Given a vector variable X and Y, that was generated for the scalar + STMT, generate instructions to permute the vector elements of X and Y + using permutation mask MASK_VEC, insert them at *GSI and return the + permuted vector variable. */ static tree -reverse_vec_elements (tree x, gimple stmt, gimple_stmt_iterator *gsi) +permute_vec_elements (tree x, tree y, tree mask_vec, gimple stmt, + gimple_stmt_iterator *gsi) { tree vectype = TREE_TYPE (x); - tree mask_vec, perm_dest, data_ref; + tree perm_dest, data_ref; gimple perm_stmt; - mask_vec = perm_mask_for_reverse (vectype); - perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype); + data_ref = make_ssa_name (perm_dest, NULL); /* Generate the permute statement. */ - perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, perm_dest, - x, x, mask_vec); - data_ref = make_ssa_name (perm_dest, perm_stmt); - gimple_set_lhs (perm_stmt, data_ref); + perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, data_ref, + x, y, mask_vec); vect_finish_stmt_generation (stmt, perm_stmt, gsi); return data_ref; @@ -4194,6 +4223,10 @@ vectorizable_load (gimple stmt, gimple_s bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); int vf; tree aggr_type; + tree gather_base = NULL_TREE, gather_off = NULL_TREE; + tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE; + int gather_scale = 1; + enum vect_def_type gather_dt = vect_unknown_def_type; if (loop_vinfo) { @@ -4274,7 +4307,7 @@ vectorizable_load (gimple stmt, gimple_s { strided_load = true; /* FORNOW */ - gcc_assert (! nested_in_vect_loop); + gcc_assert (! 
nested_in_vect_loop && !STMT_VINFO_GATHER_P (stmt_info)); first_stmt = GROUP_FIRST_ELEMENT (stmt_info); if (!slp && !PURE_SLP_STMT (stmt_info)) @@ -4289,7 +4322,7 @@ vectorizable_load (gimple stmt, gimple_s if (negative) { - gcc_assert (!strided_load); + gcc_assert (!strided_load && !STMT_VINFO_GATHER_P (stmt_info)); alignment_support_scheme = vect_supportable_dr_alignment (dr, false); if (alignment_support_scheme != dr_aligned && alignment_support_scheme != dr_unaligned_supported) @@ -4306,6 +4339,23 @@ vectorizable_load (gimple stmt, gimple_s } } + if (STMT_VINFO_GATHER_P (stmt_info)) + { + gimple def_stmt; + tree def; + gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base, + &gather_off, &gather_scale); + gcc_assert (gather_decl); + if (!vect_is_simple_use_1 (gather_off, loop_vinfo, bb_vinfo, + &def_stmt, &def, &gather_dt, + &gather_off_vectype)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "gather index use not simple."); + return false; + } + } + if (!vec_stmt) /* transformation not required. */ { STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; @@ -4318,6 +4368,163 @@ vectorizable_load (gimple stmt, gimple_s /** Transform. **/ + if (STMT_VINFO_GATHER_P (stmt_info)) + { + tree vec_oprnd0 = NULL_TREE, op; + tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl)); + tree rettype, srctype, ptrtype, idxtype, masktype, scaletype; + tree ptr, mask, var, scale, perm_mask = NULL_TREE, prev_res = NULL_TREE; + edge pe = loop_preheader_edge (loop); + gimple_seq seq; + basic_block new_bb; + enum { NARROW, NONE, WIDEN } modifier; + int gather_off_nunits = TYPE_VECTOR_SUBPARTS (gather_off_vectype); + + if (nunits == gather_off_nunits) + modifier = NONE; + else if (nunits == gather_off_nunits / 2) + { + unsigned char *sel = XALLOCAVEC (unsigned char, gather_off_nunits); + modifier = WIDEN; + + for (i = 0; i < gather_off_nunits; ++i) + sel[i] = i | nunits; + + perm_mask = gen_perm_mask (gather_off_vectype, sel); + gcc_assert (perm_mask != NULL_TREE); + } + else if (nunits == gather_off_nunits * 2) + { + unsigned char *sel = XALLOCAVEC (unsigned char, nunits); + modifier = NARROW; + + for (i = 0; i < nunits; ++i) + sel[i] = i < gather_off_nunits + ? i : i + nunits - gather_off_nunits; + + perm_mask = gen_perm_mask (vectype, sel); + gcc_assert (perm_mask != NULL_TREE); + ncopies *= 2; + } + else + gcc_unreachable (); + + rettype = TREE_TYPE (TREE_TYPE (gather_decl)); + srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + scaletype = TREE_VALUE (arglist); + gcc_checking_assert (types_compatible_p (srctype, rettype) + && types_compatible_p (srctype, masktype)); + + vec_dest = vect_create_destination_var (scalar_dest, vectype); + + ptr = fold_convert (ptrtype, gather_base); + ptr = fold_build2 (POINTER_PLUS_EXPR, ptrtype, ptr, + fold_convert (sizetype, DR_INIT (dr))); + if (!is_gimple_min_invariant (ptr)) + { + ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE); + new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); + gcc_assert (!new_bb); + } + + /* Currently we support only unconditional gather loads, + so mask should be all ones. 
*/ + if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE) + mask = build_int_cst (TREE_TYPE (masktype), -1); + else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype))) + { + REAL_VALUE_TYPE r; + long tmp[6]; + for (j = 0; j < 6; ++j) + tmp[j] = -1; + real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype))); + mask = build_real (TREE_TYPE (masktype), r); + } + else + gcc_unreachable (); + mask = build_vector_from_val (masktype, mask); + mask = vect_init_vector (stmt, mask, masktype, NULL); + + scale = build_int_cst (scaletype, gather_scale); + + prev_stmt_info = NULL; + for (j = 0; j < ncopies; ++j) + { + if (modifier == WIDEN && (j & 1)) + op = permute_vec_elements (vec_oprnd0, vec_oprnd0, + perm_mask, stmt, gsi); + else if (j == 0) + op = vec_oprnd0 + = vect_get_vec_def_for_operand (gather_off, stmt, NULL); + else + op = vec_oprnd0 + = vect_get_vec_def_for_stmt_copy (gather_dt, vec_oprnd0); + + if (!useless_type_conversion_p (idxtype, TREE_TYPE (op))) + { + gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op)) + == TYPE_VECTOR_SUBPARTS (idxtype)); + var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL); + add_referenced_var (var); + var = make_ssa_name (var, NULL); + op = build1 (VIEW_CONVERT_EXPR, idxtype, op); + new_stmt + = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, + op, NULL_TREE); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + op = var; + } + + new_stmt + = gimple_build_call (gather_decl, 5, mask, ptr, op, mask, scale); + + if (!useless_type_conversion_p (vectype, rettype)) + { + gcc_assert (TYPE_VECTOR_SUBPARTS (vectype) + == TYPE_VECTOR_SUBPARTS (rettype)); + var = vect_get_new_vect_var (rettype, vect_simple_var, NULL); + add_referenced_var (var); + op = make_ssa_name (var, new_stmt); + gimple_call_set_lhs (new_stmt, op); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + var = make_ssa_name (vec_dest, NULL); + op = build1 (VIEW_CONVERT_EXPR, vectype, op); + new_stmt + = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, op, + NULL_TREE); + } + else + { + var = make_ssa_name (vec_dest, new_stmt); + gimple_call_set_lhs (new_stmt, var); + } + + vect_finish_stmt_generation (stmt, new_stmt, gsi); + + if (modifier == NARROW) + { + if ((j & 1) == 0) + { + prev_res = var; + continue; + } + var = permute_vec_elements (prev_res, var, + perm_mask, stmt, gsi); + new_stmt = SSA_NAME_DEF_STMT (var); + } + + if (prev_stmt_info == NULL) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; + else + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; + prev_stmt_info = vinfo_for_stmt (new_stmt); + } + return true; + } + if (strided_load) { first_stmt = GROUP_FIRST_ELEMENT (stmt_info); @@ -4700,7 +4907,9 @@ vectorizable_load (gimple stmt, gimple_s if (negative) { - new_temp = reverse_vec_elements (new_temp, stmt, gsi); + tree perm_mask = perm_mask_for_reverse (vectype); + new_temp = permute_vec_elements (new_temp, new_temp, + perm_mask, stmt, gsi); new_stmt = SSA_NAME_DEF_STMT (new_temp); } --- gcc/target.def.jj 2011-10-26 14:19:11.000000000 +0200 +++ gcc/target.def 2011-10-26 14:21:10.000000000 +0200 @@ -1021,6 +1021,14 @@ DEFHOOK (void), default_autovectorize_vector_sizes) +/* Target builtin that implements vector gather operation. 
*/ +DEFHOOK +(builtin_gather, + "", + tree, + (const_tree mem_vectype, const_tree index_type, int scale), + NULL) + HOOK_VECTOR_END (vectorize) #undef HOOK_PREFIX --- gcc/tree-data-ref.c.jj 2011-10-26 14:19:03.000000000 +0200 +++ gcc/tree-data-ref.c 2011-10-26 14:21:10.000000000 +0200 @@ -1351,13 +1351,11 @@ dr_may_alias_p (const struct data_refere return refs_may_alias_p (addr_a, addr_b); } -static void compute_self_dependence (struct data_dependence_relation *); - /* Initialize a data dependence relation between data accesses A and B. NB_LOOPS is the number of loops surrounding the references: the size of the classic distance/direction vectors. */ -static struct data_dependence_relation * +struct data_dependence_relation * initialize_data_dependence_relation (struct data_reference *a, struct data_reference *b, VEC (loop_p, heap) *loop_nest) @@ -4121,7 +4119,7 @@ compute_affine_dependence (struct data_d /* This computes the dependence relation for the same data reference into DDR. */ -static void +void compute_self_dependence (struct data_dependence_relation *ddr) { unsigned int i; --- gcc/tree-data-ref.h.jj 2011-10-26 14:19:03.000000000 +0200 +++ gcc/tree-data-ref.h 2011-10-26 14:21:10.000000000 +0200 @@ -423,6 +423,9 @@ extern bool graphite_find_data_reference VEC (data_reference_p, heap) **); struct data_reference *create_data_ref (loop_p, loop_p, tree, gimple, bool); extern bool find_loop_nest (struct loop *, VEC (loop_p, heap) **); +extern struct data_dependence_relation *initialize_data_dependence_relation + (struct data_reference *, struct data_reference *, VEC (loop_p, heap) *); +extern void compute_self_dependence (struct data_dependence_relation *); extern void compute_all_dependences (VEC (data_reference_p, heap) *, VEC (ddr_p, heap) **, VEC (loop_p, heap) *, bool); --- gcc/doc/tm.texi.in.jj 2011-10-26 14:19:02.000000000 +0200 +++ gcc/doc/tm.texi.in 2011-10-26 14:21:10.000000000 +0200 @@ -5696,6 +5696,14 @@ mode returned by @code{TARGET_VECTORIZE_ The default is zero which means to not iterate over other vector sizes. @end deftypefn +@hook TARGET_VECTORIZE_BUILTIN_GATHER +Target builtin that implements vector gather operation. @var{mem_vectype} +is the vector type of the load and @var{index_type} is scalar type of +the index, scaled by @var{scale}. +The default is @code{NULL_TREE} which means to not vectorize gather +loads. +@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses --- gcc/doc/tm.texi.jj 2011-10-26 14:19:04.000000000 +0200 +++ gcc/doc/tm.texi 2011-10-26 14:21:10.000000000 +0200 @@ -5758,6 +5758,14 @@ mode returned by @code{TARGET_VECTORIZE_ The default is zero which means to not iterate over other vector sizes. @end deftypefn +@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_GATHER (const_tree @var{mem_vectype}, const_tree @var{index_type}, int @var{scale}) +Target builtin that implements vector gather operation. @var{mem_vectype} +is the vector type of the load and @var{index_type} is scalar type of +the index, scaled by @var{scale}. +The default is @code{NULL_TREE} which means to not vectorize gather +loads. 
+@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses --- gcc/tree-vect-data-refs.c.jj 2011-10-26 14:19:11.000000000 +0200 +++ gcc/tree-vect-data-refs.c 2011-10-26 17:14:16.000000000 +0200 @@ -2497,6 +2497,135 @@ vect_prune_runtime_alias_test_list (loop return true; } +/* Check whether a non-affine read in stmt is suitable for gather load + and if so, return a builtin decl for that operation. */ + +tree +vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep, + tree *offp, int *scalep) +{ + HOST_WIDE_INT scale = 1; + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); + tree offtype = NULL_TREE; + tree base = DR_BASE_ADDRESS (dr); + tree off = DR_OFFSET (dr); + tree decl; + + if (TREE_CODE (base) == POINTER_PLUS_EXPR + && integer_zerop (off) + && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME + && !chrec_contains_symbols_defined_in_loop (TREE_OPERAND (base, 0), + loop->num)) + { + off = TREE_OPERAND (base, 1); + base = TREE_OPERAND (base, 0); + } + if (is_gimple_min_invariant (base) + || (TREE_CODE (base) == SSA_NAME + && !chrec_contains_symbols_defined_in_loop (base, loop->num))) + { + /* DR_BASE_ADDRESS + DR_INIT would go into the constant + pointer, DR_OFFSET into vector. */ + STRIP_NOPS (off); + if (TREE_CODE (off) == MULT_EXPR + && host_integerp (TREE_OPERAND (off, 1), 0)) + { + scale = tree_low_cst (TREE_OPERAND (off, 1), 0); + if (scale != (int) scale) + return NULL_TREE; + off = TREE_OPERAND (off, 0); + } + if (CONVERT_EXPR_P (off)) + { + tree op = TREE_OPERAND (off, 0); + if (TREE_CODE (TREE_TYPE (op)) == INTEGER_TYPE + && TYPE_PRECISION (TREE_TYPE (op)) + == GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (op))) + && TYPE_PRECISION (TREE_TYPE (op)) + < TYPE_PRECISION (TREE_TYPE (off))) + { + off = op; + offtype = TREE_TYPE (off); + } + } + STRIP_NOPS (off); + switch (TREE_CODE (off)) + { + case SSA_NAME: + break; + case PLUS_EXPR: + case MINUS_EXPR: + if (TREE_CODE (TREE_OPERAND (off, 0)) == SSA_NAME + && TREE_CODE (TREE_OPERAND (off, 1)) == SSA_NAME) + { + bool c0 + = chrec_contains_symbols_defined_in_loop (TREE_OPERAND (off, + 0), + loop->num); + bool c1 + = chrec_contains_symbols_defined_in_loop (TREE_OPERAND (off, + 1), + loop->num); + if (c0 && !c1) + { + tree bo = fold_convert (sizetype, TREE_OPERAND (off, 1)); + if (scale != 1) + bo = size_binop (MULT_EXPR, bo, size_int (scale)); + if (TREE_CODE (off) == MINUS_EXPR) + bo = fold_build1 (NEGATE_EXPR, sizetype, bo); + base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (base), + base, bo); + off = TREE_OPERAND (off, 0); + break; + } + if (!c0 && c1 && TREE_CODE (off) == PLUS_EXPR) + { + tree bo = fold_convert (sizetype, TREE_OPERAND (off, 0)); + if (scale != 1) + bo = size_binop (MULT_EXPR, bo, size_int (scale)); + base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (base), + base, bo); + off = TREE_OPERAND (off, 1); + break; + } + } + return NULL_TREE; + default: + return NULL_TREE; + } + } + else + { + if (!integer_zerop (off)) + return NULL_TREE; + else if (TREE_CODE (base) != SSA_NAME) + return NULL_TREE; + off = base; + base = build_int_cst (TREE_TYPE (DR_BASE_ADDRESS (dr)), 0); + } + + if (!chrec_contains_symbols_defined_in_loop (off, loop->num)) + return NULL_TREE; + + if (offtype == NULL_TREE) + offtype = TREE_TYPE (off); + + decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info), + offtype, scale); + if (decl == NULL_TREE) + return NULL_TREE; + 
+ if (basep) + *basep = base; + if (offp) + *offp = off; + if (scalep) + *scalep = scale; + return decl; +} + /* Function vect_analyze_data_refs. @@ -2573,6 +2702,7 @@ vect_analyze_data_refs (loop_vec_info lo gimple stmt; stmt_vec_info stmt_info; tree base, offset, init; + bool gather = false; int vf; if (!dr || !DR_REF (dr)) @@ -2594,22 +2724,51 @@ vect_analyze_data_refs (loop_vec_info lo /* Check that analysis of the data-ref succeeded. */ if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) - || !DR_STEP (dr)) + || !DR_STEP (dr)) { - if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) - { - fprintf (vect_dump, "not vectorized: data ref analysis failed "); - print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); - } + /* If target supports vector gather loads, see if they can't + be used. */ + if (loop_vinfo + && DR_IS_READ (dr) + && !TREE_THIS_VOLATILE (DR_REF (dr)) + && targetm.vectorize.builtin_gather != NULL + && !nested_in_vect_loop_p (loop, stmt)) + { + struct data_reference *newdr + = create_data_ref (NULL, loop_containing_stmt (stmt), + DR_REF (dr), stmt, true); + gcc_assert (newdr != NULL && DR_REF (newdr)); + if (DR_BASE_ADDRESS (newdr) + && DR_OFFSET (newdr) + && DR_INIT (newdr) + && DR_STEP (newdr) + && integer_zerop (DR_STEP (newdr))) + { + dr = newdr; + gather = true; + } + else + free_data_ref (newdr); + } - if (bb_vinfo) - { - STMT_VINFO_VECTORIZABLE (stmt_info) = false; - stop_bb_analysis = true; - continue; - } + if (!gather) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + { + fprintf (vect_dump, "not vectorized: data ref analysis " + "failed "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + if (bb_vinfo) + { + STMT_VINFO_VECTORIZABLE (stmt_info) = false; + stop_bb_analysis = true; + continue; + } - return false; + return false; + } } if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) @@ -2625,7 +2784,9 @@ vect_analyze_data_refs (loop_vec_info lo continue; } - return false; + if (gather) + free_data_ref (dr); + return false; } if (TREE_THIS_VOLATILE (DR_REF (dr))) @@ -2666,6 +2827,8 @@ vect_analyze_data_refs (loop_vec_info lo continue; } + if (gather) + free_data_ref (dr); return false; } @@ -2791,6 +2954,8 @@ vect_analyze_data_refs (loop_vec_info lo continue; } + if (gather) + free_data_ref (dr); return false; } @@ -2818,8 +2983,13 @@ vect_analyze_data_refs (loop_vec_info lo stop_bb_analysis = true; continue; } - else - return false; + + if (gather) + { + STMT_VINFO_DATA_REF (stmt_info) = NULL; + free_data_ref (dr); + } + return false; } /* Adjust the minimal vectorization factor according to the @@ -2827,6 +2997,86 @@ vect_analyze_data_refs (loop_vec_info lo vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); if (vf > *min_vf) *min_vf = vf; + + if (gather) + { + unsigned int j, k, n; + struct data_reference *olddr + = VEC_index (data_reference_p, datarefs, i); + VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo); + struct data_dependence_relation *ddr, *newddr; + bool bad = false; + tree off; + VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo); + + if (!vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL) + || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + { + fprintf (vect_dump, + "not vectorized: not suitable for gather "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + return false; + } + + n = VEC_length (data_reference_p, datarefs) - 1; + for (j = 0, k = i - 1; j < i; j++) + { + ddr = VEC_index (ddr_p, 
ddrs, k); + gcc_assert (DDR_B (ddr) == olddr); + newddr = initialize_data_dependence_relation (DDR_A (ddr), dr, + nest); + VEC_replace (ddr_p, ddrs, k, newddr); + free_dependence_relation (ddr); + if (!bad + && DR_IS_WRITE (DDR_A (newddr)) + && DDR_ARE_DEPENDENT (newddr) != chrec_known) + bad = true; + k += --n; + } + + k++; + n = k + VEC_length (data_reference_p, datarefs) - i - 1; + for (; k < n; k++) + { + ddr = VEC_index (ddr_p, ddrs, k); + gcc_assert (DDR_A (ddr) == olddr); + newddr = initialize_data_dependence_relation (dr, DDR_B (ddr), + nest); + VEC_replace (ddr_p, ddrs, k, newddr); + free_dependence_relation (ddr); + if (!bad + && DR_IS_WRITE (DDR_B (newddr)) + && DDR_ARE_DEPENDENT (newddr) != chrec_known) + bad = true; + } + + k = VEC_length (ddr_p, ddrs) + - VEC_length (data_reference_p, datarefs) + i; + ddr = VEC_index (ddr_p, ddrs, k); + gcc_assert (DDR_A (ddr) == olddr && DDR_B (ddr) == olddr); + newddr = initialize_data_dependence_relation (dr, dr, nest); + compute_self_dependence (newddr); + VEC_replace (ddr_p, ddrs, k, newddr); + free_dependence_relation (ddr); + VEC_replace (data_reference_p, datarefs, i, dr); + + if (bad) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + { + fprintf (vect_dump, + "not vectorized: data dependence conflict" + " prevents gather"); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + return false; + } + + STMT_VINFO_GATHER_P (stmt_info) = true; + } } return true; --- gcc/tree-vectorizer.h.jj 2011-10-26 14:19:03.000000000 +0200 +++ gcc/tree-vectorizer.h 2011-10-26 14:21:10.000000000 +0200 @@ -517,6 +517,9 @@ typedef struct _stmt_vec_info { /* Is this statement vectorizable or should it be skipped in (partial) vectorization. */ bool vectorizable; + + /* For loads only, true if this is a gather load. */ + bool gather_p; } *stmt_vec_info; /* Access Functions. 
*/ @@ -530,6 +533,7 @@ typedef struct _stmt_vec_info { #define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt #define STMT_VINFO_VECTORIZABLE(S) (S)->vectorizable #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info +#define STMT_VINFO_GATHER_P(S) (S)->gather_p #define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address #define STMT_VINFO_DR_INIT(S) (S)->dr_init @@ -838,6 +842,8 @@ extern bool vect_analyze_data_refs_align extern bool vect_verify_datarefs_alignment (loop_vec_info, bb_vec_info); extern bool vect_analyze_data_ref_accesses (loop_vec_info, bb_vec_info); extern bool vect_prune_runtime_alias_test_list (loop_vec_info); +extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *, + int *); extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *); extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree, tree *, gimple_stmt_iterator *, --- gcc/config/i386/i386-builtin-types.def.jj 2011-08-26 18:41:43.000000000 +0200 +++ gcc/config/i386/i386-builtin-types.def 2011-10-26 14:24:59.000000000 +0200 @@ -432,20 +432,24 @@ DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V4SI, V2DF, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4SI, V4DF, INT) +DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V8SI, V4DF, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V2DI, V2DF, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4DI, V4DF, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4SI, V4SF, INT) DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8SI, V8SF, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V2DI, V4SF, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4DI, V4SF, INT) +DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V4DI, V8SF, INT) DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V4SI, V2DI, INT) DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4SI, V4DI, INT) +DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V8SI, V4DI, INT) DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V2DI, V2DI, INT) DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4DI, V4DI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4SI, V4SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8SI, V8SI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V2DI, V4SI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4DI, V4SI, INT) +DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V4DI, V8SI, INT) DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND) --- gcc/config/i386/sse.md.jj 2011-10-26 14:19:11.000000000 +0200 +++ gcc/config/i386/sse.md 2011-10-26 14:21:10.000000000 +0200 @@ -314,14 +314,6 @@ (define_mode_attr i128 ;; Mix-n-match (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF]) -(define_mode_iterator AVXMODE48P_DI - [V2DI V2DF V4DI V4DF V4SF V4SI]) -(define_mode_attr AVXMODE48P_DI - [(V2DI "V2DI") (V2DF "V2DI") - (V4DI "V4DI") (V4DF "V4DI") - (V4SI "V2DI") (V4SF "V2DI") - (V8SI "V4DI") (V8SF "V4DI")]) - (define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF]) ;; Mapping of immediate bits for blend instructions @@ -12683,11 +12675,21 @@ (define_insn "vcvtps2ph256" ;; For gather* insn patterns (define_mode_iterator VEC_GATHER_MODE [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF]) -(define_mode_attr VEC_GATHER_MODE +(define_mode_attr VEC_GATHER_IDXSI [(V2DI "V4SI") (V2DF "V4SI") (V4DI "V4SI") (V4DF "V4SI") (V4SI "V4SI") (V4SF "V4SI") (V8SI "V8SI") (V8SF "V8SI")]) +(define_mode_attr VEC_GATHER_IDXDI + [(V2DI "V2DI") (V2DF "V2DI") + (V4DI "V4DI") (V4DF "V4DI") + (V4SI "V2DI") (V4SF "V2DI") + (V8SI "V4DI") (V8SF "V4DI")]) +(define_mode_attr VEC_GATHER_SRCDI + [(V2DI "V2DI") (V2DF "V2DF") + (V4DI "V4DI") (V4DF "V4DF") + (V4SI "V4SI") 
(V4SF "V4SF") + (V8SI "V4SI") (V8SF "V4SF")]) (define_expand "avx2_gathersi<mode>" [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") @@ -12696,7 +12698,8 @@ (define_expand "avx2_gathersi<mode>" (mem:<ssescalarmode> (match_par_dup 7 [(match_operand 2 "vsib_address_operand" "") - (match_operand:<VEC_GATHER_MODE> 3 "register_operand" "") + (match_operand:<VEC_GATHER_IDXSI> + 3 "register_operand" "") (match_operand:SI 5 "const1248_operand " "")])) (mem:BLK (scratch)) (match_operand:VEC_GATHER_MODE 4 "register_operand" "")] @@ -12716,7 +12719,7 @@ (define_insn "*avx2_gathersi<mode>" (match_operator:<ssescalarmode> 7 "vsib_mem_operator" [(unspec:P [(match_operand:P 3 "vsib_address_operand" "p") - (match_operand:<VEC_GATHER_MODE> 4 "register_operand" "x") + (match_operand:<VEC_GATHER_IDXSI> 4 "register_operand" "x") (match_operand:SI 6 "const1248_operand" "n")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) @@ -12732,14 +12735,16 @@ (define_insn "*avx2_gathersi<mode>" (define_expand "avx2_gatherdi<mode>" [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") (unspec:VEC_GATHER_MODE - [(match_operand:VEC_GATHER_MODE 1 "register_operand" "") + [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand" "") (mem:<ssescalarmode> (match_par_dup 7 [(match_operand 2 "vsib_address_operand" "") - (match_operand:<AVXMODE48P_DI> 3 "register_operand" "") + (match_operand:<VEC_GATHER_IDXDI> + 3 "register_operand" "") (match_operand:SI 5 "const1248_operand " "")])) (mem:BLK (scratch)) - (match_operand:VEC_GATHER_MODE 4 "register_operand" "")] + (match_operand:<VEC_GATHER_SRCDI> + 4 "register_operand" "")] UNSPEC_GATHER)) (clobber (match_scratch:VEC_GATHER_MODE 6 ""))])] "TARGET_AVX2" @@ -12750,63 +12755,21 @@ (define_expand "avx2_gatherdi<mode>" }) (define_insn "*avx2_gatherdi<mode>" - [(set (match_operand:AVXMODE48P_DI 0 "register_operand" "=&x") - (unspec:AVXMODE48P_DI - [(match_operand:AVXMODE48P_DI 2 "register_operand" "0") + [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") + (unspec:VEC_GATHER_MODE + [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0") (match_operator:<ssescalarmode> 7 "vsib_mem_operator" [(unspec:P [(match_operand:P 3 "vsib_address_operand" "p") - (match_operand:<AVXMODE48P_DI> 4 "register_operand" "x") + (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x") (match_operand:SI 6 "const1248_operand" "n")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) - (match_operand:AVXMODE48P_DI 5 "register_operand" "1")] + (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")] UNSPEC_GATHER)) - (clobber (match_scratch:AVXMODE48P_DI 1 "=&x"))] - "TARGET_AVX2" - "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "vex") - (set_attr "mode" "<sseinsnmode>")]) - -;; Special handling for VEX.256 with float arguments -;; since there're still xmms as operands -(define_expand "avx2_gatherdi<mode>256" - [(parallel [(set (match_operand:VI4F_128 0 "register_operand" "") - (unspec:VI4F_128 - [(match_operand:VI4F_128 1 "register_operand" "") - (mem:<ssescalarmode> - (match_par_dup 7 - [(match_operand 2 "vsib_address_operand" "") - (match_operand:V4DI 3 "register_operand" "") - (match_operand:SI 5 "const1248_operand " "")])) - (mem:BLK (scratch)) - (match_operand:VI4F_128 4 "register_operand" "")] - UNSPEC_GATHER)) - (clobber (match_scratch:VI4F_128 6 ""))])] - "TARGET_AVX2" -{ - operands[7] - = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3], - operands[5]), UNSPEC_VSIBADDR); -}) - 
-(define_insn "*avx2_gatherdi<mode>256" - [(set (match_operand:VI4F_128 0 "register_operand" "=x") - (unspec:VI4F_128 - [(match_operand:VI4F_128 2 "register_operand" "0") - (match_operator:<ssescalarmode> 7 "vsib_mem_operator" - [(unspec:P - [(match_operand:P 3 "vsib_address_operand" "p") - (match_operand:V4DI 4 "register_operand" "x") - (match_operand:SI 6 "const1248_operand" "n")] - UNSPEC_VSIBADDR)]) - (mem:BLK (scratch)) - (match_operand:VI4F_128 5 "register_operand" "1")] - UNSPEC_GATHER)) - (clobber (match_scratch:VI4F_128 1 "=&x"))] + (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] "TARGET_AVX2" - "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}" + "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}" [(set_attr "type" "ssemov") (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) --- gcc/config/i386/i386.c.jj 2011-10-26 14:19:11.000000000 +0200 +++ gcc/config/i386/i386.c 2011-10-26 14:45:45.000000000 +0200 @@ -25060,6 +25060,13 @@ enum ix86_builtins IX86_BUILTIN_GATHERDIV4SI, IX86_BUILTIN_GATHERDIV8SI, + /* Alternate 4 element gather for the vectorizer where + all operands are 32-byte wide. */ + IX86_BUILTIN_GATHERALTSIV4DF, + IX86_BUILTIN_GATHERALTDIV8SF, + IX86_BUILTIN_GATHERALTSIV4DI, + IX86_BUILTIN_GATHERALTDIV8SI, + /* TFmode support builtins. */ IX86_BUILTIN_INFQ, IX86_BUILTIN_HUGE_VALQ, @@ -26841,6 +26848,22 @@ ix86_init_mmx_sse_builtins (void) V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, IX86_BUILTIN_GATHERDIV8SI); + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ", + V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, + IX86_BUILTIN_GATHERALTSIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ", + V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, + IX86_BUILTIN_GATHERALTDIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ", + V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, + IX86_BUILTIN_GATHERALTSIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ", + V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, + IX86_BUILTIN_GATHERALTDIV8SI); + /* MMX access to the vec_init patterns. 
*/ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si", V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); @@ -28827,7 +28850,7 @@ rdrand_step: icode = CODE_FOR_avx2_gatherdiv4sf; goto gather_gen; case IX86_BUILTIN_GATHERDIV8SF: - icode = CODE_FOR_avx2_gatherdiv4sf256; + icode = CODE_FOR_avx2_gatherdiv8sf; goto gather_gen; case IX86_BUILTIN_GATHERSIV2DI: icode = CODE_FOR_avx2_gathersiv2di; @@ -28851,7 +28874,20 @@ rdrand_step: icode = CODE_FOR_avx2_gatherdiv4si; goto gather_gen; case IX86_BUILTIN_GATHERDIV8SI: - icode = CODE_FOR_avx2_gatherdiv4si256; + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHERALTSIV4DF: + icode = CODE_FOR_avx2_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SF: + icode = CODE_FOR_avx2_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHERALTSIV4DI: + icode = CODE_FOR_avx2_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SI: + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; gather_gen: arg0 = CALL_EXPR_ARG (exp, 0); @@ -28870,8 +28906,39 @@ rdrand_step: mode3 = insn_data[icode].operand[4].mode; mode4 = insn_data[icode].operand[5].mode; - if (target == NULL_RTX) - target = gen_reg_rtx (insn_data[icode].operand[0].mode); + if (target == NULL_RTX + || GET_MODE (target) != insn_data[icode].operand[0].mode) + subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); + else + subtarget = target; + + if (fcode == IX86_BUILTIN_GATHERALTSIV4DF + || fcode == IX86_BUILTIN_GATHERALTSIV4DI) + { + rtx half = gen_reg_rtx (V4SImode); + if (!nonimmediate_operand (op2, V8SImode)) + op2 = copy_to_mode_reg (V8SImode, op2); + emit_insn (gen_vec_extract_lo_v8si (half, op2)); + op2 = half; + } + else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF + || fcode == IX86_BUILTIN_GATHERALTDIV8SI) + { + rtx (*gen) (rtx, rtx); + rtx half = gen_reg_rtx (mode0); + if (mode0 == V4SFmode) + gen = gen_vec_extract_lo_v8sf; + else + gen = gen_vec_extract_lo_v8si; + if (!nonimmediate_operand (op0, GET_MODE (op0))) + op0 = copy_to_mode_reg (GET_MODE (op0), op0); + emit_insn (gen (half, op0)); + op0 = half; + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + } /* Force memory operand only with base register here. But we don't want to do it on memory operand for other builtin @@ -28893,10 +28960,26 @@ rdrand_step: error ("last argument must be scale 1, 2, 4, 8"); return const0_rtx; } - pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4); + pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); if (! pat) return const0_rtx; emit_insn (pat); + + if (fcode == IX86_BUILTIN_GATHERDIV8SF + || fcode == IX86_BUILTIN_GATHERDIV8SI) + { + enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode + ? V4SFmode : V4SImode; + if (target == NULL_RTX) + target = gen_reg_rtx (tmode); + if (tmode == V4SFmode) + emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); + else + emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); + } + else + target = subtarget; + return target; default: @@ -29491,6 +29574,73 @@ ix86_vectorize_builtin_conversion (unsig return NULL_TREE; } +/* Returns a decl of a function that implements gather load with + memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. + Return NULL_TREE if it is not available. */ + +static tree +ix86_vectorize_builtin_gather (const_tree mem_vectype, + const_tree index_type, int scale) +{ + bool si; + enum ix86_builtins code; + + if (! 
TARGET_AVX2) + return NULL_TREE; + + if ((TREE_CODE (index_type) != INTEGER_TYPE + && !POINTER_TYPE_P (index_type)) + || (TYPE_MODE (index_type) != SImode + && TYPE_MODE (index_type) != DImode)) + return NULL_TREE; + + if (TYPE_PRECISION (index_type) > POINTER_SIZE) + return NULL_TREE; + + /* v*gather* insn sign extends index to pointer mode. */ + if (TYPE_PRECISION (index_type) < POINTER_SIZE + && TYPE_UNSIGNED (index_type)) + return NULL_TREE; + + if (scale <= 0 + || scale > 8 + || (scale & (scale - 1)) != 0) + return NULL_TREE; + + si = TYPE_MODE (index_type) == SImode; + switch (TYPE_MODE (mem_vectype)) + { + case V2DFmode: + code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; + break; + case V4DFmode: + code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; + break; + case V2DImode: + code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; + break; + case V4DImode: + code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; + break; + case V4SFmode: + code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; + break; + case V8SFmode: + code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; + break; + case V4SImode: + code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; + break; + case V8SImode: + code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; + break; + default: + return NULL_TREE; + } + + return ix86_builtins[code]; +} + /* Returns a code for a target-specific builtin that implements reciprocal of the function, or NULL_TREE if not available. */ @@ -37693,6 +37843,9 @@ ix86_autovectorize_vector_sizes (void) #undef TARGET_VECTORIZE_BUILTIN_CONVERSION #define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion +#undef TARGET_VECTORIZE_BUILTIN_GATHER +#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather + #undef TARGET_BUILTIN_RECIPROCAL #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal --- gcc/testsuite/gcc.target/i386/avx2-gather-1.c.jj 2011-10-26 17:19:03.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-gather-1.c 2011-10-26 18:29:46.000000000 +0200 @@ -0,0 +1,215 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-options "-O3 -mavx2" } */ + +#include "avx2-check.h" + +#define N 1024 +float vf1[N+16], vf2[N]; +double vd1[N+16], vd2[N]; +int k[N]; +long l[N]; +short n[N]; + +__attribute__((noinline, noclone)) void +f1 (void) +{ + int i; + for (i = 0; i < N; i++) + vf2[i] = vf1[k[i]]; +} + +__attribute__((noinline, noclone)) void +f2 (void) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vf1[k[i]]; +} + +__attribute__((noinline, noclone)) void +f3 (int x) +{ + int i; + for (i = 0; i < N; i++) + vf2[i] = vf1[k[i] + x]; +} + +__attribute__((noinline, noclone)) void +f4 (int x) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vf1[k[i] + x]; +} + +__attribute__((noinline, noclone)) void +f5 (void) +{ + int i; + for (i = 0; i < N; i++) + vd2[i] = vd1[k[i]]; +} + +__attribute__((noinline, noclone)) void +f6 (void) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vd1[k[i]]; +} + +__attribute__((noinline, noclone)) void +f7 (int x) +{ + int i; + for (i = 0; i < N; i++) + vd2[i] = vd1[k[i] + x]; +} + +__attribute__((noinline, noclone)) void +f8 (int x) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vd1[k[i] + x]; +} + +__attribute__((noinline, noclone)) void +f9 (void) +{ + int i; + for (i = 0; i < N; i++) + vf2[i] = vf1[l[i]]; +} + +__attribute__((noinline, noclone)) void +f10 (void) +{ + 
int i; + for (i = 0; i < N; i++) + n[i] = (int) vf1[l[i]]; +} + +__attribute__((noinline, noclone)) void +f11 (long x) +{ + int i; + for (i = 0; i < N; i++) + vf2[i] = vf1[l[i] + x]; +} + +__attribute__((noinline, noclone)) void +f12 (long x) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vf1[l[i] + x]; +} + +__attribute__((noinline, noclone)) void +f13 (void) +{ + int i; + for (i = 0; i < N; i++) + vd2[i] = vd1[l[i]]; +} + +__attribute__((noinline, noclone)) void +f14 (void) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vd1[l[i]]; +} + +__attribute__((noinline, noclone)) void +f15 (long x) +{ + int i; + for (i = 0; i < N; i++) + vd2[i] = vd1[l[i] + x]; +} + +__attribute__((noinline, noclone)) void +f16 (long x) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vd1[l[i] + x]; +} + +static void +avx2_test (void) +{ + int i; + + for (i = 0; i < N + 16; i++) + { + asm (""); + vf1[i] = 17.0f + i; + vd1[i] = 19.0 + i; + } + for (i = 0; i < N; i++) + { + asm (""); + k[i] = (i * 731) & (N - 1); + l[i] = (i * 657) & (N - 1); + } + + f1 (); + f2 (); + for (i = 0; i < N; i++) + if (vf2[i] != ((i * 731) & (N - 1)) + 17 + || n[i] != ((i * 731) & (N - 1)) + 17) + abort (); + + f3 (12); + f4 (14); + for (i = 0; i < N; i++) + if (vf2[i] != ((i * 731) & (N - 1)) + 17 + 12 + || n[i] != ((i * 731) & (N - 1)) + 17 + 14) + abort (); + + f5 (); + f6 (); + for (i = 0; i < N; i++) + if (vd2[i] != ((i * 731) & (N - 1)) + 19 + || n[i] != ((i * 731) & (N - 1)) + 19) + abort (); + + f7 (7); + f8 (9); + for (i = 0; i < N; i++) + if (vd2[i] != ((i * 731) & (N - 1)) + 19 + 7 + || n[i] != ((i * 731) & (N - 1)) + 19 + 9) + abort (); + + f9 (); + f10 (); + for (i = 0; i < N; i++) + if (vf2[i] != ((i * 657) & (N - 1)) + 17 + || n[i] != ((i * 657) & (N - 1)) + 17) + abort (); + + f11 (2); + f12 (4); + for (i = 0; i < N; i++) + if (vf2[i] != ((i * 657) & (N - 1)) + 17 + 2 + || n[i] != ((i * 657) & (N - 1)) + 17 + 4) + abort (); + + f13 (); + f14 (); + for (i = 0; i < N; i++) + if (vd2[i] != ((i * 657) & (N - 1)) + 19 + || n[i] != ((i * 657) & (N - 1)) + 19) + abort (); + + f15 (13); + f16 (15); + for (i = 0; i < N; i++) + if (vd2[i] != ((i * 657) & (N - 1)) + 19 + 13 + || n[i] != ((i * 657) & (N - 1)) + 19 + 15) + abort (); +} --- gcc/testsuite/gcc.target/i386/avx2-gather-2.c.jj 2011-10-26 17:36:38.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-gather-2.c 2011-10-26 17:38:31.000000000 +0200 @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */ + +#include "avx2-gather-1.c" + +/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 16 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ --- gcc/testsuite/gcc.target/i386/avx2-gather-3.c.jj 2011-10-26 18:24:43.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-gather-3.c 2011-10-26 18:29:36.000000000 +0200 @@ -0,0 +1,167 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-options "-O3 -mavx2 -ffast-math" } */ + +#include "avx2-check.h" + +#define N 1024 +float f[N]; +double d[N]; +int k[N]; +float *l[N]; +double *n[N]; +int **m[N]; +long **o[N]; +long q[N]; +long *r[N]; +int *s[N]; + +__attribute__((noinline, noclone)) float +f1 (void) +{ + int i; + float g = 0.0; + for (i = 0; i < N / 2; i++) + g += f[k[i]]; + return g; +} + +__attribute__((noinline, noclone)) float +f2 (float *p) +{ + int i; + float g = 0.0; + for (i = 0; i < N / 2; i++) + g += p[k[i]]; + return g; +} + +__attribute__((noinline, noclone)) float +f3 (void) +{ + int i; + float g = 
0.0; + for (i = 0; i < N / 2; i++) + g += *l[i]; + return g; +} + +__attribute__((noinline, noclone)) int +f4 (void) +{ + int i; + int g = 0; + for (i = 0; i < N / 2; i++) + g += **m[i]; + return g; +} + +__attribute__((noinline, noclone)) double +f5 (void) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += d[k[i]]; + return g; +} + +__attribute__((noinline, noclone)) double +f6 (double *p) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += p[k[i]]; + return g; +} + +__attribute__((noinline, noclone)) double +f7 (void) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += *n[i]; + return g; +} + +__attribute__((noinline, noclone)) int +f8 (void) +{ + int i; + int g = 0; + for (i = 0; i < N / 2; i++) + g += **o[i]; + return g; +} + +__attribute__((noinline, noclone)) float +f9 (void) +{ + int i; + float g = 0.0; + for (i = 0; i < N / 2; i++) + g += f[q[i]]; + return g; +} + +__attribute__((noinline, noclone)) float +f10 (float *p) +{ + int i; + float g = 0.0; + for (i = 0; i < N / 2; i++) + g += p[q[i]]; + return g; +} + +__attribute__((noinline, noclone)) double +f11 (void) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += d[q[i]]; + return g; +} + +__attribute__((noinline, noclone)) double +f12 (double *p) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += p[q[i]]; + return g; +} + +static void +avx2_test (void) +{ + int i; + + for (i = 0; i < N; i++) + { + asm (""); + f[i] = -256.0f + i; + d[i] = -258.0 + i; + k[i] = (i * 731) & (N - 1); + q[i] = (i * 657) & (N - 1); + l[i] = &f[(i * 239) & (N - 1)]; + n[i] = &d[(i * 271) & (N - 1)]; + r[i] = &q[(i * 323) & (N - 1)]; + s[i] = &k[(i * 565) & (N - 1)]; + m[i] = &s[(i * 13) & (N - 1)]; + o[i] = &r[(i * 19) & (N - 1)]; + } + + if (f1 () != 136448.0f || f2 (f) != 136448.0f || f3 () != 130304.0) + abort (); + if (f4 () != 261376 || f5 () != 135424.0 || f6 (d) != 135424.0) + abort (); + if (f7 () != 129280.0 || f8 () != 259840L || f9 () != 130816.0f) + abort (); + if (f10 (f) != 130816.0f || f11 () != 129792.0 || f12 (d) != 129792.0) + abort (); +} Jakub