Here is a prototype for doing vectorized reductions using the SLP approach. I would appreciate feedback on whether this is a feasible approach and whether the overall direction is right.
The idea is to vectorize a reduction like this:

  S = A[0] + A[1] + ... + A[N];

into

  Sv = Av[0] + Av[1] + ... + Av[N/VL];

So that, for instance, the following code:

typedef double T;
T sum;

void
foo (T* __restrict__ a)
{
  sum = a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7];
}

instead of:

foo:
.LFB23:
	.cfi_startproc
	movsd	(%rdi), %xmm0
	movsd	16(%rdi), %xmm1
	addsd	8(%rdi), %xmm0
	addsd	24(%rdi), %xmm1
	addsd	%xmm1, %xmm0
	movsd	32(%rdi), %xmm1
	addsd	40(%rdi), %xmm1
	addsd	%xmm1, %xmm0
	movsd	48(%rdi), %xmm1
	addsd	56(%rdi), %xmm1
	addsd	%xmm1, %xmm0
	movsd	%xmm0, sum2(%rip)
	ret
	.cfi_endproc

is compiled into:

foo:
.LFB11:
	.cfi_startproc
	movupd	32(%rdi), %xmm0
	movupd	48(%rdi), %xmm3
	movupd	(%rdi), %xmm1
	movupd	16(%rdi), %xmm2
	addpd	%xmm3, %xmm0
	addpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm0
	haddpd	%xmm0, %xmm0
	movlpd	%xmm0, sum(%rip)
	ret
	.cfi_endproc

As this is a very crude prototype there are some things to consider:

1. The current SLP framework assumes the presence of grouped stores, so I cannot use it directly: a reduction does not require grouped stores (or even stores at all). I therefore reuse parts of the existing functionality, but in places I had to create a stripped-down version of it for my own needs.

2. The current version handles only PLUS reductions, as they are encountered most often and are therefore the most practical.

3. While an SLP transformation normally operates inside a single basic block, this requirement greatly restricts its practical application: in sufficiently complex code there will be vectorizable subexpressions defined in basic block(s) other than the one where the reduction result resides. However, for the sake of simplicity, only single uses within the same block are considered for now.

4. For the same reason the current version does not deal with partial reductions, which would require merging partial sums and carefully removing the scalar statements that participate in the vector part. The latter is done automatically by DCE in the case of a full reduction vectorization.

5. There is no cost model yet, for the reasons mentioned in points 3 and 4.

A rough source-level sketch of the intended transformation for the example above is included below, just before the patch.

Thanks in advance.

--
Anton
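For illustration only, here is a minimal sketch (mine, not part of the patch) of what the transformation is intended to produce for foo() at the source level, written with GCC's generic vector extensions. The two-lane vector type and the names v2df and vfoo are assumptions made purely for this illustration:

typedef double T;
/* Two doubles per vector, as with SSE2 in the assembly above.  */
typedef T v2df __attribute__ ((vector_size (16)));

T sum;

void
vfoo (T* __restrict__ a)
{
  v2df v0, v1, v2, v3;

  /* Unaligned vector loads of a[0..1], a[2..3], a[4..5], a[6..7],
     corresponding to the vector MEM_REFs built by create_vector_load.  */
  __builtin_memcpy (&v0, a + 0, sizeof (v0));
  __builtin_memcpy (&v1, a + 2, sizeof (v1));
  __builtin_memcpy (&v2, a + 4, sizeof (v2));
  __builtin_memcpy (&v3, a + 6, sizeof (v3));

  /* The chain of vector additions built by vect_slp_reduc.  */
  v2df s = ((v0 + v1) + v2) + v3;

  /* Horizontal reduction of the last vector sum (IFN_REDUC_PLUS in the
     patch, haddpd in the assembly above).  */
  sum = s[0] + s[1];
}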
From eb2644765d68ef1c629e584086355a8d66df7c73 Mon Sep 17 00:00:00 2001
From: Anton Youdkevitch <anton.youdkevi...@bell-sw.com>
Date: Fri, 9 Nov 2018 20:50:05 +0300
Subject: [PATCH] WIP BB-only SLP reduction

Very basic effort to implement SLP vectorization
for reductions.
---
 gcc/tree-vect-slp.c | 313 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 313 insertions(+)

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 0ab7bd8086c..14c7a7e8069 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2913,6 +2913,317 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
   return bb_vinfo;
 }
 
+#include "alias.h"
+#include "tree-into-ssa.h"
+
+static tree
+create_vector_load (stmt_vec_info info, gimple_stmt_iterator *gsi)
+{
+  tree dataref_ptr = NULL_TREE;
+  tree ref_type;
+  tree offset = NULL_TREE;
+  tree byte_offset = NULL_TREE;
+  gimple *ptr_incr = NULL;
+  tree dummy;
+  tree data_ref = NULL;
+
+  tree vectype = STMT_VINFO_VECTYPE (info);
+  tree scalar_dest = gimple_assign_lhs (info->stmt);
+
+  data_reference *dr = STMT_VINFO_DATA_REF (info);
+  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (info);
+  ref_type = reference_alias_ptr_type (DR_REF (dr));
+  tree bump = size_zero_node;
+  if (DR_BASE_ALIGNMENT (dr) < TYPE_ALIGN (vectype))
+    {
+      vectype = build_aligned_type
+	(vectype, DR_BASE_ALIGNMENT (dr) * BITS_PER_UNIT);
+    }
+  /* Manually set the misalignment as
+     it is uninitialized at this moment.  */
+  SET_DR_MISALIGNMENT (dr_info, 0);
+  DR_TARGET_ALIGNMENT (dr_info) = DR_BASE_ALIGNMENT (dr);
+
+  dataref_ptr = vect_create_data_ref_ptr (info, vectype, NULL,
+					  offset, &dummy, gsi, &ptr_incr,
+					  false, byte_offset, bump);
+  data_ref
+    = fold_build2 (MEM_REF, vectype, dataref_ptr,
+		   build_int_cst (ref_type, 0));
+  tree vdst = vect_create_destination_var (scalar_dest, vectype);
+  tree new_name = make_ssa_name (vdst, NULL);
+  gassign *new_stmt = gimple_build_assign (new_name, data_ref);
+  vect_finish_stmt_generation (info, new_stmt, gsi);
+  return new_name;
+}
+
+
+/* Blatantly taken from tree-vect-data-refs.c.  */
+
+static int
+dr_group_sort_cmp (const void *dra_, const void *drb_)
+{
+  data_reference_p dra = *(data_reference_p *) const_cast<void *> (dra_);
+  data_reference_p drb = *(data_reference_p *) const_cast<void *> (drb_);
+  int cmp;
+
+  /* Stabilize sort.  */
+  if (dra == drb)
+    return 0;
+
+  /* DRs in different loops never belong to the same group.  */
+  loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
+  loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
+  if (loopa != loopb)
+    return loopa->num < loopb->num ? -1 : 1;
+
+  /* Ordering of DRs according to base.  */
+  cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
+			       DR_BASE_ADDRESS (drb));
+  if (cmp != 0)
+    return cmp;
+
+  /* And according to DR_OFFSET.  */
+  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
+  if (cmp != 0)
+    return cmp;
+
+  /* Put reads before writes.  */
+  if (DR_IS_READ (dra) != DR_IS_READ (drb))
+    return DR_IS_READ (dra) ? -1 : 1;
+
+  /* Then sort after access size.  */
+  cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
+			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
+  if (cmp != 0)
+    return cmp;
+
+  /* And after step.  */
+  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
+  if (cmp != 0)
+    return cmp;
+
+  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
+  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
+  if (cmp == 0)
+    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
+
+  return cmp;
+}
+
+
+/* Walk down the reduction chain until a leaf that is a load
+   operation is found or a non-assign operation is encountered.  */
+
+static void
+walk_redu_chain (gimple *stmt, auto_vec<gimple *> *chain,
+		 hash_set<gimple *> *visited)
+{
+  visited->add (stmt);
+
+  if (! is_gimple_assign (stmt))
+    return;
+
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+
+  if (gimple_assign_load_p (stmt))
+    {
+      chain->safe_push (stmt);
+      return;
+    }
+  else if (code == PLUS_EXPR)
+    {
+      ssa_op_iter iter;
+      tree opnd;
+      gimple *st;
+
+      FOR_EACH_SSA_TREE_OPERAND (opnd, stmt, iter, SSA_OP_USE)
+	{
+	  if (!has_single_use (opnd))
+	    continue;
+
+	  st = SSA_NAME_DEF_STMT (opnd);
+	  walk_redu_chain (st, chain, visited);
+	}
+    }
+
+  return;
+}
+
+static bool
+vect_slp_reduc (basic_block bb)
+{
+  bool any_vectorized = false;
+  gimple_stmt_iterator gsi;
+
+  DUMP_VECT_SCOPE ("vect_slp_analyze_reduction");
+
+  for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      auto_vec<gimple *> rchain;
+      hash_set<gimple *> visited;
+
+      if (rchain.length () != 0)
+	rchain.release ();
+
+      if (is_gimple_assign (stmt)
+	  && gimple_assign_rhs_code (stmt) == PLUS_EXPR
+	  && !VECTOR_TYPE_P (TREE_TYPE (gimple_assign_lhs (stmt)))
+	  && !visited.contains (stmt))
+	{
+	  walk_redu_chain (stmt, &rchain, &visited);
+
+	  /* There is no cost model yet, but fewer than 4 elements
+	     in the chain are pointless to consider anyway.  */
+	  if (rchain.length () < 4)
+	    continue;
+	}
+      else
+	{
+	  continue;
+	}
+
+      vec<data_reference_p> datarefs = vNULL;
+      if (datarefs != vNULL)
+	{
+	  free_data_refs (datarefs);
+	}
+
+      {
+	gimple *stmt;
+	unsigned int i;
+
+	FOR_EACH_VEC_ELT (rchain, i, stmt)
+	  {
+	    vect_find_stmt_data_reference (NULL, stmt, &datarefs);
+	  }
+      }
+
+      /* This is crude, but since the stmt chain gets updated
+	 passing the actual region is not possible.  */
+      gimple_stmt_iterator rend = gsi_last_nondebug_bb (bb);
+      gimple_stmt_iterator rbegin = gsi_start_nondebug_after_labels_bb (bb);
+      vec_info_shared shared;
+      bb_vec_info bb_vinfo = new _bb_vec_info (rbegin, rend, &shared);
+
+      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
+      bb_vinfo->shared->save_datarefs ();
+      poly_uint64 min_vf = 2;
+      bool ok = false;
+
+      ok = vect_analyze_data_refs (bb_vinfo, &min_vf);
+      if (ok)
+	ok = vect_analyze_data_ref_accesses (bb_vinfo);
+
+      vect_record_base_alignments (bb_vinfo);
+
+      auto_vec<data_reference_p> vld;
+      data_reference_p head = NULL, dr;
+      tree scalar_type = TREE_TYPE (gimple_get_lhs (rchain[0]));
+      tree vectype = get_vectype_for_scalar_type (scalar_type);
+
+      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+      unsigned int veclen = nunits.to_constant ();
+      unsigned elt_size
+	= tree_to_uhwi (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
+      unsigned int elt_idx = 0;
+      unsigned int i;
+      datarefs.qsort (dr_group_sort_cmp);
+      FOR_EACH_VEC_ELT (datarefs, i, dr)
+	{
+	  if (! head)
+	    {
+	      head = dr;
+	      elt_idx = 1;
+	    }
+	  else
+	    {
+	      poly_offset_int distance
+		= wi::to_poly_offset (DR_INIT (dr))
+		  - wi::to_poly_offset (DR_INIT (head));
+
+	      int cmp = (0 == data_ref_compare_tree (DR_BASE_ADDRESS (dr),
+						     DR_BASE_ADDRESS (head)));
+
+	      if (cmp && known_eq (distance, (elt_idx * elt_size)))
+		{
+		  elt_idx += 1;
+		  if (elt_idx == veclen)
+		    {
+		      vld.safe_push (head);
+		      head = NULL;
+		      elt_idx = 0;
+		    }
+		}
+	      else
+		{
+		  head = dr;
+		  elt_idx = 1;
+		}
+	    }
+	}
+      if (vld.length () > 1)
+	{
+	  any_vectorized = true;
+	  tree vec_dest;
+	  gimple *before_store = stmt;
+	  tree scalar_dest = gimple_get_lhs (before_store);
+	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+	  auto_vec<tree> vects;
+
+	  if (vld.length () > 1)
+	    {
+	      unsigned int i;
+	      FOR_EACH_VEC_ELT (vld, i, dr)
+		{
+		  stmt_vec_info info = bb_vinfo->lookup_stmt (dr->stmt);
+		  vects.safe_push (create_vector_load (info, &gsi));
+		}
+	    }
+
+	  tree vect, prev = NULL;
+	  FOR_EACH_VEC_ELT (vects, i, vect)
+	    {
+	      if (! prev)
+		{
+		  prev = vect;
+		  continue;
+		}
+
+	      tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
+	      tree new_name = make_ssa_name (vec_dest);
+	      gimple *new_stmt = gimple_build_assign (new_name, PLUS_EXPR,
+						      prev, vect);
+	      gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
+	      prev = new_name;
+	    }
+
+	  /* The last vector sum is prev, hand it to the reduction.  */
+	  vec_dest = prev;
+	  tree scalar_dest1 = vect_create_destination_var (scalar_dest, 0);
+	  gimple *epilog_stmt = gimple_build_call_internal
+	    (IFN_REDUC_PLUS, 1, vec_dest);
+
+	  tree reduc_temp = make_ssa_name (scalar_dest1, epilog_stmt);
+	  gimple_set_lhs (epilog_stmt, reduc_temp);
+	  gsi_insert_before (&gsi, epilog_stmt, GSI_SAME_STMT);
+	  tree result = gimple_assign_lhs (before_store);
+	  gassign *res_stmt = gimple_build_assign (result, reduc_temp);
+	  gsi_insert_before (&gsi, res_stmt, GSI_SAME_STMT);
+	  gsi_remove (&gsi, true);
+	  update_stmt (res_stmt);
+	}
+      vld.release ();
+      delete bb_vinfo;
+    }
+  return any_vectorized;
+}
+
 
 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
    true if anything in the basic-block was vectorized.  */
@@ -2925,6 +3236,8 @@ vect_slp_bb (basic_block bb)
   bool any_vectorized = false;
   auto_vector_sizes vector_sizes;
 
+  any_vectorized = vect_slp_reduc (bb);
+
   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
 
   /* Autodetect first vector size we try.  */
--
2.17.1