The following paves the way for more supported load permutations. I've split it off from the patch supporting more permutations, as that still requires a load of surgery :/
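For reference, this is the kind of situation meant by a load permutation (illustrative example only, not a testcase from this patch; whether it actually gets vectorized depends on the target):

/* The two stores form an SLP group; their operands load from the same
   interleaving group in b[], but in the opposite order, so the load
   node needs the load permutation { 1, 0 } relative to the stores.  */
void
foo (int *a, int *b)
{
  a[0] = b[1];
  a[1] = b[0];
}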
Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2013-04-19  Richard Biener  <rguent...@suse.de>

	* tree-vectorizer.h (struct _slp_instance): Move load_permutation
	member ...
	(struct _slp_tree): ... here.  Make it a vector of unsigned ints.
	(SLP_INSTANCE_LOAD_PERMUTATION): Remove.
	(SLP_TREE_LOAD_PERMUTATION): Add.
	(vect_transform_slp_perm_load): Adjust prototype.
	* tree-vect-slp.c (vect_free_slp_tree): Adjust.
	(vect_free_slp_instance): Likewise.
	(vect_create_new_slp_node): Likewise.
	(vect_supported_slp_permutation_p): Remove.
	(vect_slp_rearrange_stmts): Adjust.
	(vect_supported_load_permutation_p): Likewise.  Inline
	vect_supported_slp_permutation_p here.
	(vect_analyze_slp_instance): Compute load permutations per
	slp node instead of per instance.
	(vect_get_slp_defs): Adjust.
	(vect_transform_slp_perm_load): Likewise.
	(vect_schedule_slp_instance): Remove redundant code.
	(vect_schedule_slp): Remove hack for PR56270, add it ...
	* tree-vect-stmts.c (vectorizable_load): ... here, do not CSE
	loads for SLP.  Adjust.

Index: trunk/gcc/tree-vect-slp.c
===================================================================
*** trunk.orig/gcc/tree-vect-slp.c	2013-04-19 12:43:20.000000000 +0200
--- trunk/gcc/tree-vect-slp.c	2013-04-19 13:04:29.317524077 +0200
*************** vect_free_slp_tree (slp_tree node)
*** 78,83 ****
--- 78,84 ----
    SLP_TREE_CHILDREN (node).release ();
    SLP_TREE_SCALAR_STMTS (node).release ();
    SLP_TREE_VEC_STMTS (node).release ();
+   SLP_TREE_LOAD_PERMUTATION (node).release ();
    free (node);
  }

*************** void
*** 89,95 ****
  vect_free_slp_instance (slp_instance instance)
  {
    vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
-   SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
    SLP_INSTANCE_LOADS (instance).release ();
    SLP_INSTANCE_BODY_COST_VEC (instance).release ();
    free (instance);
--- 90,95 ----
*************** vect_create_new_slp_node (vec<gimple> sc
*** 120,125 ****
--- 120,126 ----
    SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
    SLP_TREE_VEC_STMTS (node).create (0);
    SLP_TREE_CHILDREN (node).create (nops);
+   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
    return node;
  }

*************** vect_mark_slp_stmts_relevant (slp_tree n
*** 1026,1098 ****
  }

- /* Check if the permutation required by the SLP INSTANCE is supported.
-    Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.  */
- 
- static bool
- vect_supported_slp_permutation_p (slp_instance instance)
- {
-   slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
-   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
-   gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
-   vec<slp_tree> sorted_loads = vNULL;
-   int index;
-   slp_tree *tmp_loads = NULL;
-   int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
-   slp_tree load;
- 
-   /* FORNOW: The only supported loads permutation is loads from the same
-      location in all the loads in the node, when the data-refs in
-      nodes of LOADS constitute an interleaving chain.
-      Sort the nodes according to the order of accesses in the chain.  */
-   tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
-   for (i = 0, j = 0;
-        SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
-        && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
-        i += group_size, j++)
-     {
-       gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
-       /* Check that the loads are all in the same interleaving chain.
*/ - if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Build SLP failed: unsupported data " - "permutation "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, - scalar_stmt, 0); - } - - free (tmp_loads); - return false; - } - - tmp_loads[index] = load; - } - - sorted_loads.create (group_size); - for (i = 0; i < group_size; i++) - sorted_loads.safe_push (tmp_loads[i]); - - SLP_INSTANCE_LOADS (instance).release (); - SLP_INSTANCE_LOADS (instance) = sorted_loads; - free (tmp_loads); - - if (!vect_transform_slp_perm_load (stmt, vNULL, NULL, - SLP_INSTANCE_UNROLLING_FACTOR (instance), - instance, true)) - return false; - - return true; - } - - /* Rearrange the statements of NODE according to PERMUTATION. */ static void vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, ! vec<int> permutation) { gimple stmt; vec<gimple> tmp_stmts; --- 1027,1037 ---- } /* Rearrange the statements of NODE according to PERMUTATION. */ static void vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, ! vec<unsigned> permutation) { gimple stmt; vec<gimple> tmp_stmts; *************** vect_slp_rearrange_stmts (slp_tree node, *** 1114,1145 **** } ! /* Check if the required load permutation is supported. ! LOAD_PERMUTATION contains a list of indices of the loads. ! In SLP this permutation is relative to the order of grouped stores that are ! the base of the SLP instance. */ static bool ! vect_supported_load_permutation_p (slp_instance slp_instn, int group_size, ! vec<int> load_permutation) { ! int i = 0, j, prev = -1, next, k, number_of_groups; ! bool supported, bad_permutation = false; sbitmap load_index; slp_tree node; gimple stmt, load, next_load, first_load; struct data_reference *dr; - bb_vec_info bb_vinfo; - - /* FORNOW: permutations are only supported in SLP. */ - if (!slp_instn) - return false; if (dump_enabled_p ()) { dump_printf_loc (MSG_NOTE, vect_location, "Load permutation "); ! FOR_EACH_VEC_ELT (load_permutation, i, next) ! dump_printf (MSG_NOTE, "%d ", next); } /* In case of reduction every load permutation is allowed, since the order --- 1053,1081 ---- } ! /* Check if the required load permutations in the SLP instance ! SLP_INSTN are supported. */ static bool ! vect_supported_load_permutation_p (slp_instance slp_instn) { ! unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn); ! unsigned int i, j, k, next; sbitmap load_index; slp_tree node; gimple stmt, load, next_load, first_load; struct data_reference *dr; if (dump_enabled_p ()) { dump_printf_loc (MSG_NOTE, vect_location, "Load permutation "); ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! if (node->load_permutation.exists ()) ! FOR_EACH_VEC_ELT (node->load_permutation, j, next) ! dump_printf (MSG_NOTE, "%d ", next); ! else ! for (i = 0; i < group_size; ++i) ! dump_printf (MSG_NOTE, "%d ", i); } /* In case of reduction every load permutation is allowed, since the order *************** vect_supported_load_permutation_p (slp_i *** 1150,1358 **** permutation). */ /* Check that all the load nodes are of the same size. */ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size) return false; node = SLP_INSTANCE_TREE (slp_instn); stmt = SLP_TREE_SCALAR_STMTS (node)[0]; - /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP - instance, not all the loads belong to the same node or interleaving - group. 
Hence, we need to divide them into groups according to - GROUP_SIZE. */ - number_of_groups = load_permutation.length () / group_size; /* Reduction (there are no data-refs in the root). In reduction chain the order of the loads is important. */ if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)) && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) { ! int first_group_load_index; ! ! /* Compare all the permutation sequences to the first one. */ ! for (i = 1; i < number_of_groups; i++) ! { ! k = 0; ! for (j = i * group_size; j < i * group_size + group_size; j++) ! { ! next = load_permutation[j]; ! first_group_load_index = load_permutation[k]; ! ! if (next != first_group_load_index) ! { ! bad_permutation = true; ! break; ! } ! k++; ! } ! ! if (bad_permutation) ! break; ! } ! ! if (!bad_permutation) ! { ! /* Check that the loads in the first sequence are different and there ! are no gaps between them. */ ! load_index = sbitmap_alloc (group_size); ! bitmap_clear (load_index); ! for (k = 0; k < group_size; k++) ! { ! first_group_load_index = load_permutation[k]; ! if (bitmap_bit_p (load_index, first_group_load_index)) ! { ! bad_permutation = true; ! break; ! } ! ! bitmap_set_bit (load_index, first_group_load_index); ! } ! ! if (!bad_permutation) ! for (k = 0; k < group_size; k++) ! if (!bitmap_bit_p (load_index, k)) ! { ! bad_permutation = true; ! break; ! } ! sbitmap_free (load_index); ! } ! if (!bad_permutation) ! { ! /* This permutation is valid for reduction. Since the order of the ! statements in the nodes is not important unless they are memory ! accesses, we can rearrange the statements in all the nodes ! according to the order of the loads. */ ! vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size, ! load_permutation); ! SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release (); ! return true; ! } } /* In basic block vectorization we allow any subchain of an interleaving chain. FORNOW: not supported in loop SLP because of realignment compications. */ ! bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)); ! bad_permutation = false; ! /* Check that for every node in the instance the loads form a subchain. */ ! if (bb_vinfo) { FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) { next_load = NULL; - first_load = NULL; FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load) { - if (!first_load) - first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)); - else if (first_load - != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load))) - { - bad_permutation = true; - break; - } - if (j != 0 && next_load != load) ! { ! bad_permutation = true; ! break; ! } ! next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load)); } - - if (bad_permutation) - break; } /* Check that the alignment of the first load in every subchain, i.e., ! the first statement in every load node, is supported. */ ! if (!bad_permutation) ! { ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! { ! first_load = SLP_TREE_SCALAR_STMTS (node)[0]; ! if (first_load ! != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load))) ! { ! dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load)); ! if (vect_supportable_dr_alignment (dr, false) ! == dr_unaligned_unsupported) ! { ! if (dump_enabled_p ()) ! { ! dump_printf_loc (MSG_MISSED_OPTIMIZATION, ! vect_location, ! "unsupported unaligned load "); ! dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ! first_load, 0); ! } ! bad_permutation = true; ! break; ! } ! } ! } ! if (!bad_permutation) ! { ! SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release (); ! return true; ! } ! 
} } /* FORNOW: the only supported permutation is 0..01..1.. of length equal to GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as well (unless it's reduction). */ ! if (load_permutation.length () ! != (unsigned int) (group_size * group_size)) return false; - supported = true; load_index = sbitmap_alloc (group_size); bitmap_clear (load_index); ! for (j = 0; j < group_size; j++) { ! for (i = j * group_size, k = 0; ! load_permutation.iterate (i, &next) && k < group_size; ! i++, k++) ! { ! if (i != j * group_size && next != prev) ! { ! supported = false; ! break; ! } ! ! prev = next; ! } ! ! if (bitmap_bit_p (load_index, prev)) ! { ! supported = false; ! break; ! } ! ! bitmap_set_bit (load_index, prev); } ! ! for (j = 0; j < group_size; j++) ! if (!bitmap_bit_p (load_index, j)) { sbitmap_free (load_index); return false; } - sbitmap_free (load_index); ! if (supported && i == group_size * group_size ! && vect_supported_slp_permutation_p (slp_instn)) ! return true; ! ! return false; } --- 1086,1246 ---- permutation). */ /* Check that all the load nodes are of the same size. */ + /* ??? Can't we assert this? */ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size) return false; node = SLP_INSTANCE_TREE (slp_instn); stmt = SLP_TREE_SCALAR_STMTS (node)[0]; /* Reduction (there are no data-refs in the root). In reduction chain the order of the loads is important. */ if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)) && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) { ! slp_tree load; ! unsigned int lidx; ! /* Compare all the permutation sequences to the first one. We know ! that at least one load is permuted. */ ! node = SLP_INSTANCE_LOADS (slp_instn)[0]; ! if (!node->load_permutation.exists ()) ! return false; ! for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i) ! { ! if (!load->load_permutation.exists ()) ! return false; ! FOR_EACH_VEC_ELT (load->load_permutation, j, lidx) ! if (lidx != node->load_permutation[j]) ! return false; ! } ! /* Check that the loads in the first sequence are different and there ! are no gaps between them. */ ! load_index = sbitmap_alloc (group_size); ! bitmap_clear (load_index); ! FOR_EACH_VEC_ELT (node->load_permutation, i, lidx) ! { ! if (bitmap_bit_p (load_index, lidx)) ! { ! sbitmap_free (load_index); ! return false; ! } ! bitmap_set_bit (load_index, lidx); ! } ! for (i = 0; i < group_size; i++) ! if (!bitmap_bit_p (load_index, i)) ! { ! sbitmap_free (load_index); ! return false; ! } ! sbitmap_free (load_index); ! ! /* This permutation is valid for reduction. Since the order of the ! statements in the nodes is not important unless they are memory ! accesses, we can rearrange the statements in all the nodes ! according to the order of the loads. */ ! vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size, ! node->load_permutation); ! /* We are done, no actual permutations need to be generated. */ ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! SLP_TREE_LOAD_PERMUTATION (node).release (); ! return true; } /* In basic block vectorization we allow any subchain of an interleaving chain. FORNOW: not supported in loop SLP because of realignment compications. */ ! if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt))) { + /* Check that for every node in the instance the loads + form a subchain. 
*/ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) { next_load = NULL; FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load) { if (j != 0 && next_load != load) ! return false; next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load)); } } /* Check that the alignment of the first load in every subchain, i.e., ! the first statement in every load node, is supported. ! ??? This belongs in alignment checking. */ ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! { ! first_load = SLP_TREE_SCALAR_STMTS (node)[0]; ! if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load))) ! { ! dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load)); ! if (vect_supportable_dr_alignment (dr, false) ! == dr_unaligned_unsupported) ! { ! if (dump_enabled_p ()) ! { ! dump_printf_loc (MSG_MISSED_OPTIMIZATION, ! vect_location, ! "unsupported unaligned load "); ! dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ! first_load, 0); ! } ! return false; ! } ! } ! } ! /* We are done, no actual permutations need to be generated. */ ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! SLP_TREE_LOAD_PERMUTATION (node).release (); ! return true; } /* FORNOW: the only supported permutation is 0..01..1.. of length equal to GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as well (unless it's reduction). */ ! if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size) return false; + FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) + if (!node->load_permutation.exists ()) + return false; load_index = sbitmap_alloc (group_size); bitmap_clear (load_index); ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) { ! unsigned int lidx = node->load_permutation[0]; ! if (bitmap_bit_p (load_index, lidx)) ! { ! sbitmap_free (load_index); ! return false; ! } ! bitmap_set_bit (load_index, lidx); ! FOR_EACH_VEC_ELT (node->load_permutation, j, k) ! if (k != lidx) ! { ! sbitmap_free (load_index); ! return false; ! } } ! for (i = 0; i < group_size; i++) ! if (!bitmap_bit_p (load_index, i)) { sbitmap_free (load_index); return false; } sbitmap_free (load_index); ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! if (node->load_permutation.exists () ! && !vect_transform_slp_perm_load ! (node, vNULL, NULL, ! SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true)) ! return false; ! return true; } *************** vect_analyze_slp_instance (loop_vec_info *** 1642,1658 **** SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL; SLP_INSTANCE_LOADS (new_instance) = loads; SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL; - SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = vNULL; /* Compute the load permutation. */ slp_tree load_node; bool loads_permuted = false; - vec<int> load_permutation; - load_permutation.create (group_size * group_size); FOR_EACH_VEC_ELT (loads, i, load_node) { int j; gimple load, first_stmt; first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0])); FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load) --- 1530,1546 ---- SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL; SLP_INSTANCE_LOADS (new_instance) = loads; SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL; /* Compute the load permutation. 
*/ slp_tree load_node; bool loads_permuted = false; FOR_EACH_VEC_ELT (loads, i, load_node) { + vec<unsigned> load_permutation; int j; gimple load, first_stmt; + bool this_load_permuted = false; + load_permutation.create (group_size); first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0])); FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load) *************** vect_analyze_slp_instance (loop_vec_info *** 1661,1676 **** = vect_get_place_in_interleaving_chain (load, first_stmt); gcc_assert (load_place != -1); if (load_place != j) ! loads_permuted = true; load_permutation.safe_push (load_place); } } if (loads_permuted) { ! SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation; ! if (!vect_supported_load_permutation_p (new_instance, group_size, ! load_permutation)) { if (dump_enabled_p ()) { --- 1549,1569 ---- = vect_get_place_in_interleaving_chain (load, first_stmt); gcc_assert (load_place != -1); if (load_place != j) ! this_load_permuted = true; load_permutation.safe_push (load_place); } + if (!this_load_permuted) + { + load_permutation.release (); + continue; + } + SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation; + loads_permuted = true; } if (loads_permuted) { ! if (!vect_supported_load_permutation_p (new_instance)) { if (dump_enabled_p ()) { *************** vect_analyze_slp_instance (loop_vec_info *** 1679,1694 **** "permutation "); dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); } - vect_free_slp_instance (new_instance); return false; } SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) ! = vect_find_first_load_in_slp_instance (new_instance); } - else - load_permutation.release (); /* Compute the costs of this SLP instance. */ vect_analyze_slp_cost (loop_vinfo, bb_vinfo, --- 1572,1584 ---- "permutation "); dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); } vect_free_slp_instance (new_instance); return false; } SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) ! = vect_find_first_load_in_slp_instance (new_instance); } /* Compute the costs of this SLP instance. */ vect_analyze_slp_cost (loop_vinfo, bb_vinfo, *************** vect_get_slp_defs (vec<tree> ops, slp_tr *** 2653,2659 **** vectorized_defs = false; if (SLP_TREE_CHILDREN (slp_node).length () > child_index) { ! child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index]; /* We have to check both pattern and original def, if available. */ gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0]; --- 2543,2549 ---- vectorized_defs = false; if (SLP_TREE_CHILDREN (slp_node).length () > child_index) { ! child = SLP_TREE_CHILDREN (slp_node)[child_index]; /* We have to check both pattern and original def, if available. */ gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0]; *************** vect_get_mask_element (gimple stmt, int *** 2854,2869 **** /* Generate vector permute statements from a list of loads in DR_CHAIN. If ANALYZE_ONLY is TRUE, only check that it is possible to create valid ! permute statements for SLP_NODE_INSTANCE. */ bool ! vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain, gimple_stmt_iterator *gsi, int vf, slp_instance slp_node_instance, bool analyze_only) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree mask_element_type = NULL_TREE, mask_type; int i, j, k, nunits, vec_index = 0, scalar_index; - slp_tree node; tree vectype = STMT_VINFO_VECTYPE (stmt_info); gimple next_scalar_stmt; int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); --- 2744,2761 ---- /* Generate vector permute statements from a list of loads in DR_CHAIN. 
If ANALYZE_ONLY is TRUE, only check that it is possible to create valid ! permute statements for the SLP node NODE of the SLP instance ! SLP_NODE_INSTANCE. */ ! bool ! vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, gimple_stmt_iterator *gsi, int vf, slp_instance slp_node_instance, bool analyze_only) { + gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0]; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree mask_element_type = NULL_TREE, mask_type; int i, j, k, nunits, vec_index = 0, scalar_index; tree vectype = STMT_VINFO_VECTYPE (stmt_info); gimple next_scalar_stmt; int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); *************** vect_transform_slp_perm_load (gimple stm *** 2910,2915 **** --- 2802,2810 ---- relatively to SLP_NODE_INSTANCE unrolling factor. */ ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance); + if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) + return false; + /* Generate permutation masks for every NODE. Number of masks for each NODE is equal to GROUP_SIZE. E.g., we have a group of three nodes with three loads from the same *************** vect_transform_slp_perm_load (gimple stm *** 2928,2934 **** we need the second and the third vectors: {b1,c1,a2,b2} and {c2,a3,b3,c3}. */ - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_node_instance), i, node) { scalar_index = 0; index = 0; --- 2823,2828 ---- *************** vect_transform_slp_perm_load (gimple stm *** 2944,2949 **** --- 2838,2844 ---- { for (k = 0; k < group_size; k++) { + i = SLP_TREE_LOAD_PERMUTATION (node)[k]; first_mask_element = i + j * group_size; if (!vect_get_mask_element (stmt, first_mask_element, 0, nunits, only_one_vec, index, *************** vect_transform_slp_perm_load (gimple stm *** 2956,2964 **** if (index == nunits) { ! tree mask_vec, *mask_elts; ! int l; ! if (!can_vec_perm_p (mode, false, mask)) { if (dump_enabled_p ()) --- 2851,2857 ---- if (index == nunits) { ! index = 0; if (!can_vec_perm_p (mode, false, mask)) { if (dump_enabled_p ()) *************** vect_transform_slp_perm_load (gimple stm *** 2974,2988 **** return false; } - mask_elts = XALLOCAVEC (tree, nunits); - for (l = 0; l < nunits; ++l) - mask_elts[l] = build_int_cst (mask_element_type, mask[l]); - mask_vec = build_vector (mask_type, mask_elts); - index = 0; - if (!analyze_only) { ! if (need_next_vector) { first_vec_index = second_vec_index; second_vec_index = vec_index; --- 2867,2883 ---- return false; } if (!analyze_only) { ! int l; ! tree mask_vec, *mask_elts; ! mask_elts = XALLOCAVEC (tree, nunits); ! for (l = 0; l < nunits; ++l) ! mask_elts[l] = build_int_cst (mask_element_type, ! mask[l]); ! mask_vec = build_vector (mask_type, mask_elts); ! ! if (need_next_vector) { first_vec_index = second_vec_index; second_vec_index = vec_index; *************** vect_schedule_slp_instance (slp_tree nod *** 3019,3025 **** unsigned int vec_stmts_size, nunits, group_size; tree vectype; int i; - slp_tree loads_node; slp_tree child; if (!node) --- 2914,2919 ---- *************** vect_schedule_slp_instance (slp_tree nod *** 3043,3062 **** size. */ vec_stmts_size = (vectorization_factor * group_size) / nunits; - /* In case of load permutation we have to allocate vectorized statements for - all the nodes that participate in that permutation. 
*/ - if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ()) - { - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node) - { - if (!SLP_TREE_VEC_STMTS (loads_node).exists ()) - { - SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size); - SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size; - } - } - } - if (!SLP_TREE_VEC_STMTS (node).exists ()) { SLP_TREE_VEC_STMTS (node).create (vec_stmts_size); --- 2937,2942 ---- *************** vect_schedule_slp_instance (slp_tree nod *** 3074,3080 **** if (SLP_INSTANCE_FIRST_LOAD_STMT (instance) && STMT_VINFO_GROUPED_ACCESS (stmt_info) && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)) ! && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ()) si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance)); else if (is_pattern_stmt_p (stmt_info)) si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)); --- 2954,2960 ---- if (SLP_INSTANCE_FIRST_LOAD_STMT (instance) && STMT_VINFO_GROUPED_ACCESS (stmt_info) && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)) ! && SLP_TREE_LOAD_PERMUTATION (node).exists ()) si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance)); else if (is_pattern_stmt_p (stmt_info)) si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)); *************** vect_schedule_slp (loop_vec_info loop_vi *** 3153,3160 **** { vec<slp_instance> slp_instances; slp_instance instance; ! slp_tree loads_node; ! unsigned int i, j, vf; bool is_store = false; if (loop_vinfo) --- 3033,3039 ---- { vec<slp_instance> slp_instances; slp_instance instance; ! unsigned int i, vf; bool is_store = false; if (loop_vinfo) *************** vect_schedule_slp (loop_vec_info loop_vi *** 3173,3186 **** /* Schedule the tree of INSTANCE. */ is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance), instance, vf); - - /* Clear STMT_VINFO_VEC_STMT of all loads. With shared loads - between SLP instances we fail to properly initialize the - vectorized SLP stmts and confuse different load permutations. */ - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node) - STMT_VINFO_VEC_STMT - (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL; - if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "vectorizing stmts using SLP."); --- 3052,3057 ---- Index: trunk/gcc/tree-vect-stmts.c =================================================================== *** trunk.orig/gcc/tree-vect-stmts.c 2013-04-19 12:43:20.000000000 +0200 --- trunk/gcc/tree-vect-stmts.c 2013-04-19 13:02:46.114368141 +0200 *************** vectorizable_load (gimple stmt, gimple_s *** 4754,4765 **** { first_stmt = GROUP_FIRST_ELEMENT (stmt_info); if (slp ! && !SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance).exists () && first_stmt != SLP_TREE_SCALAR_STMTS (slp_node)[0]) first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0]; /* Check if the chain of loads is already vectorized. */ ! if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt))) { *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); return true; --- 4754,4774 ---- { first_stmt = GROUP_FIRST_ELEMENT (stmt_info); if (slp ! && !SLP_TREE_LOAD_PERMUTATION (slp_node).exists () && first_stmt != SLP_TREE_SCALAR_STMTS (slp_node)[0]) first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0]; /* Check if the chain of loads is already vectorized. */ ! if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)) ! /* For SLP we would need to copy over SLP_TREE_VEC_STMTS. ! ??? But we can only do so if there is exactly one ! as we have no way to get at the rest. Leave the CSE ! opportunity alone. ! ??? With the group load eventually participating ! 
in multiple different permutations (having multiple ! slp nodes which refer to the same group) the CSE ! is even wrong code. See PR56270. */ ! && !slp) { *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); return true; *************** vectorizable_load (gimple stmt, gimple_s *** 4772,4778 **** { grouped_load = false; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ! if (SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance).exists ()) slp_perm = true; group_gap = GROUP_GAP (vinfo_for_stmt (first_stmt)); } --- 4781,4787 ---- { grouped_load = false; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ! if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) slp_perm = true; group_gap = GROUP_GAP (vinfo_for_stmt (first_stmt)); } *************** vectorizable_load (gimple stmt, gimple_s *** 5163,5169 **** if (slp_perm) { ! if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi, vf, slp_node_instance, false)) { dr_chain.release (); --- 5172,5178 ---- if (slp_perm) { ! if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf, slp_node_instance, false)) { dr_chain.release (); Index: trunk/gcc/tree-vectorizer.h =================================================================== *** trunk.orig/gcc/tree-vectorizer.h 2013-04-19 12:43:20.000000000 +0200 --- trunk/gcc/tree-vectorizer.h 2013-04-19 12:48:52.156018695 +0200 *************** struct _slp_tree { *** 106,111 **** --- 106,114 ---- vec<slp_tree> children; /* A group of scalar stmts to be vectorized together. */ vec<gimple> stmts; + /* Load permutation relative to the stores, NULL if there is no + permutation. */ + vec<unsigned> load_permutation; /* Vectorized stmt/s. */ vec<gimple> vec_stmts; /* Number of vector stmts that are created to replace the group of scalar *************** typedef struct _slp_instance { *** 131,140 **** /* Vectorization costs associated with SLP instance. */ stmt_vector_for_cost body_cost_vec; - /* Loads permutation relatively to the stores, NULL if there is no - permutation. */ - vec<int> load_permutation; - /* The group of nodes that contain loads of this SLP instance. */ vec<slp_tree> loads; --- 134,139 ---- *************** typedef struct _slp_instance { *** 149,155 **** #define SLP_INSTANCE_GROUP_SIZE(S) (S)->group_size #define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor #define SLP_INSTANCE_BODY_COST_VEC(S) (S)->body_cost_vec - #define SLP_INSTANCE_LOAD_PERMUTATION(S) (S)->load_permutation #define SLP_INSTANCE_LOADS(S) (S)->loads #define SLP_INSTANCE_FIRST_LOAD_STMT(S) (S)->first_load --- 148,153 ---- *************** typedef struct _slp_instance { *** 157,162 **** --- 155,161 ---- #define SLP_TREE_SCALAR_STMTS(S) (S)->stmts #define SLP_TREE_VEC_STMTS(S) (S)->vec_stmts #define SLP_TREE_NUMBER_OF_VEC_STMTS(S) (S)->vec_stmts_size + #define SLP_TREE_LOAD_PERMUTATION(S) (S)->load_permutation /* This structure is used in creation of an SLP tree. Each instance corresponds to the same operand in a group of scalar stmts in an SLP *************** extern int vect_get_single_scalar_iterat *** 961,967 **** /* In tree-vect-slp.c. */ extern void vect_free_slp_instance (slp_instance); ! extern bool vect_transform_slp_perm_load (gimple, vec<tree> , gimple_stmt_iterator *, int, slp_instance, bool); extern bool vect_schedule_slp (loop_vec_info, bb_vec_info); --- 960,966 ---- /* In tree-vect-slp.c. */ extern void vect_free_slp_instance (slp_instance); ! extern bool vect_transform_slp_perm_load (slp_tree, vec<tree> , gimple_stmt_iterator *, int, slp_instance, bool); extern bool vect_schedule_slp (loop_vec_info, bb_vec_info);
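To make the new representation a bit more concrete, a made-up example (not part of the patch, and modulo operand swapping during SLP build and whether the permutation is actually supported on the target):

/* Illustrative only.  For the SLP group of the two stores in

     void
     bar (int *a, int *b, int *c)
     {
       a[0] = b[1] * c[0];
       a[1] = b[0] * c[1];
     }

   the instance now has two load nodes:

     node for b:  scalar stmts { b[1], b[0] },
                  SLP_TREE_LOAD_PERMUTATION = { 1, 0 }
     node for c:  scalar stmts { c[0], c[1] },
                  no load permutation (the loads are in store order,
                  so vect_analyze_slp_instance releases the vector)

   where previously the instance carried the single flat vector
   SLP_INSTANCE_LOAD_PERMUTATION = { 1, 0, 0, 1 }.  */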