The SLP code currently punts for all variable-length permutes. This patch makes it handle the easy case of N->N permutes in which the number of vector lanes is a multiple of N. Every permute then uses the same mask, and that mask repeats (with a stride) every N elements.
The patch uses the same path for constant-length vectors, since it should be slightly cheaper in terms of compile time. Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf and x86_64-linux-gnu. OK to install? Richard 2018-08-23 Richard Sandiford <richard.sandif...@arm.com> gcc/ * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out the case in which the permute needs only a single element and repeats for every vector of the result. Extend that case to handle variable-length vectors. * tree-vect-stmts.c (vectorizable_load): Update accordingly. gcc/testsuite/ * gcc.target/aarch64/sve/slp_perm_1.c: New test. * gcc.target/aarch64/sve/slp_perm_2.c: Likewise. * gcc.target/aarch64/sve/slp_perm_3.c: Likewise. * gcc.target/aarch64/sve/slp_perm_4.c: Likewise. * gcc.target/aarch64/sve/slp_perm_5.c: Likewise. * gcc.target/aarch64/sve/slp_perm_6.c: Likewise. * gcc.target/aarch64/sve/slp_perm_7.c: Likewise. Index: gcc/tree-vect-slp.c =================================================================== *** gcc/tree-vect-slp.c 2018-08-21 14:47:08.339163256 +0100 --- gcc/tree-vect-slp.c 2018-08-23 09:59:35.245682525 +0100 *************** vect_transform_slp_perm_load (slp_tree n *** 3606,3618 **** { stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; vec_info *vinfo = stmt_info->vinfo; - tree mask_element_type = NULL_TREE, mask_type; int vec_index = 0; tree vectype = STMT_VINFO_VECTYPE (stmt_info); ! int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); unsigned int mask_element; machine_mode mode; - unsigned HOST_WIDE_INT nunits, const_vf; if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) return false; --- 3606,3616 ---- { stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; vec_info *vinfo = stmt_info->vinfo; int vec_index = 0; tree vectype = STMT_VINFO_VECTYPE (stmt_info); ! unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); unsigned int mask_element; machine_mode mode; if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) return false; *************** vect_transform_slp_perm_load (slp_tree n *** 3620,3641 **** stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); mode = TYPE_MODE (vectype); ! ! /* At the moment, all permutations are represented using per-element ! indices, so we can't cope with variable vector lengths or ! vectorization factors. */ ! if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) ! || !vf.is_constant (&const_vf)) ! return false; ! ! /* The generic VEC_PERM_EXPR code always uses an integral type of the ! same size as the vector element being permuted. */ ! mask_element_type = lang_hooks.types.type_for_mode ! (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1); ! mask_type = get_vectype_for_scalar_type (mask_element_type); ! vec_perm_builder mask (nunits, nunits, 1); ! mask.quick_grow (nunits); ! vec_perm_indices indices; /* Initialize the vect stmts of NODE to properly insert the generated stmts later. */ --- 3618,3624 ---- stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); mode = TYPE_MODE (vectype); ! poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); /* Initialize the vect stmts of NODE to properly insert the generated stmts later. */ *************** vect_transform_slp_perm_load (slp_tree n *** 3669,3682 **** bool noop_p = true; *n_perms = 0; ! for (unsigned int j = 0; j < const_vf; j++) { ! for (int k = 0; k < group_size; k++) { ! unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k] ! + j * DR_GROUP_SIZE (stmt_info)); ! vec_index = i / nunits; ! mask_element = i % nunits; if (vec_index == first_vec_index || first_vec_index == -1) { --- 3652,3704 ---- bool noop_p = true; *n_perms = 0; ! vec_perm_builder mask; ! unsigned int nelts_to_build; ! unsigned int nvectors_per_build; ! bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info) ! && multiple_p (nunits, group_size)); ! if (repeating_p) ! { ! /* A single vector contains a whole number of copies of the node, so: ! (a) all permutes can use the same mask; and ! (b) the permutes only need a single vector input. */ ! mask.new_vector (nunits, group_size, 3); ! nelts_to_build = mask.encoded_nelts (); ! nvectors_per_build = SLP_TREE_VEC_STMTS (node).length (); ! } ! else ! { ! /* We need to construct a separate mask for each vector statement. */ ! unsigned HOST_WIDE_INT const_nunits, const_vf; ! if (!nunits.is_constant (&const_nunits) ! || !vf.is_constant (&const_vf)) ! return false; ! mask.new_vector (const_nunits, const_nunits, 1); ! nelts_to_build = const_vf * group_size; ! nvectors_per_build = 1; ! } ! ! unsigned int count = mask.encoded_nelts (); ! mask.quick_grow (count); ! vec_perm_indices indices; ! ! for (unsigned int j = 0; j < nelts_to_build; j++) { ! unsigned int iter_num = j / group_size; ! unsigned int stmt_num = j % group_size; ! unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) ! + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]); ! if (repeating_p) { ! first_vec_index = 0; ! mask_element = i; ! } ! else ! { ! /* Enforced before the loop when !repeating_p. */ ! unsigned int const_nunits = nunits.to_constant (); ! vec_index = i / const_nunits; ! mask_element = i % const_nunits; if (vec_index == first_vec_index || first_vec_index == -1) { *************** vect_transform_slp_perm_load (slp_tree n *** 3686,3692 **** || second_vec_index == -1) { second_vec_index = vec_index; ! mask_element += nunits; } else { --- 3708,3714 ---- || second_vec_index == -1) { second_vec_index = vec_index; ! mask_element += const_nunits; } else { *************** vect_transform_slp_perm_load (slp_tree n *** 3702,3751 **** return false; } ! gcc_assert (mask_element < 2 * nunits); ! if (mask_element != index) ! noop_p = false; ! mask[index++] = mask_element; ! if (index == nunits && !noop_p) { ! indices.new_vector (mask, 2, nunits); ! if (!can_vec_perm_const_p (mode, indices)) { ! if (dump_enabled_p ()) { ! dump_printf_loc (MSG_MISSED_OPTIMIZATION, ! vect_location, ! "unsupported vect permute { "); ! for (i = 0; i < nunits; ++i) ! { ! dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); ! dump_printf (MSG_MISSED_OPTIMIZATION, " "); ! } ! dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); } ! gcc_assert (analyze_only); ! return false; } ! ! ++*n_perms; } ! if (index == nunits) { ! if (!analyze_only) ! { ! tree mask_vec = NULL_TREE; ! if (! noop_p) ! mask_vec = vec_perm_indices_to_tree (mask_type, indices); ! if (second_vec_index == -1) ! second_vec_index = first_vec_index; /* Generate the permute statement if necessary. */ ! tree first_vec = dr_chain[first_vec_index]; ! tree second_vec = dr_chain[second_vec_index]; stmt_vec_info perm_stmt_info; if (! noop_p) { --- 3724,3777 ---- return false; } ! gcc_assert (mask_element < 2 * const_nunits); ! } ! ! if (mask_element != index) ! noop_p = false; ! mask[index++] = mask_element; ! if (index == count && !noop_p) ! { ! indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits); ! if (!can_vec_perm_const_p (mode, indices)) { ! if (dump_enabled_p ()) { ! dump_printf_loc (MSG_MISSED_OPTIMIZATION, ! vect_location, ! "unsupported vect permute { "); ! for (i = 0; i < count; ++i) { ! dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); ! dump_printf (MSG_MISSED_OPTIMIZATION, " "); } ! dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); } ! gcc_assert (analyze_only); ! return false; } ! ++*n_perms; ! } ! ! if (index == count) ! { ! if (!analyze_only) { ! tree mask_vec = NULL_TREE; ! if (! noop_p) ! mask_vec = vect_gen_perm_mask_checked (vectype, indices); ! if (second_vec_index == -1) ! second_vec_index = first_vec_index; + for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) + { /* Generate the permute statement if necessary. */ ! tree first_vec = dr_chain[first_vec_index + ri]; ! tree second_vec = dr_chain[second_vec_index + ri]; stmt_vec_info perm_stmt_info; if (! noop_p) { *************** vect_transform_slp_perm_load (slp_tree n *** 3771,3782 **** SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt_info; } - - index = 0; - first_vec_index = -1; - second_vec_index = -1; - noop_p = true; } } } --- 3797,3808 ---- SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt_info; } } + + index = 0; + first_vec_index = -1; + second_vec_index = -1; + noop_p = true; } } Index: gcc/tree-vect-stmts.c =================================================================== *** gcc/tree-vect-stmts.c 2018-08-22 13:58:45.652208084 +0100 --- gcc/tree-vect-stmts.c 2018-08-23 09:59:35.245682525 +0100 *************** vectorizable_load (stmt_vec_info stmt_in *** 8003,8015 **** if (slp) { grouped_load = false; ! /* For SLP permutation support we need to load the whole group, ! not only the number of vector stmts the permutation result ! fits in. */ ! if (slp_perm) { ! /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for ! variable VF. */ unsigned int const_vf = vf.to_constant (); unsigned int const_nunits = nunits.to_constant (); vec_num = CEIL (group_size * const_vf, const_nunits); --- 8003,8020 ---- if (slp) { grouped_load = false; ! /* If an SLP permutation is from N elements to N elements, ! and if one vector holds a whole number of N, we can load ! the inputs to the permutation in the same way as an ! unpermuted sequence. In other cases we need to load the ! whole group, not only the number of vector stmts the ! permutation result fits in. */ ! if (slp_perm ! && (group_size != SLP_INSTANCE_GROUP_SIZE (slp_node_instance) ! || !multiple_p (nunits, group_size))) { ! /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for ! variable VF; see vect_transform_slp_perm_load. */ unsigned int const_vf = vf.to_constant (); unsigned int const_nunits = nunits.to_constant (); vec_num = CEIL (group_size * const_vf, const_nunits); Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c =================================================================== *** /dev/null 2018-07-26 10:26:13.137955424 +0100 --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c 2018-08-23 09:59:35.245682525 +0100 *************** *** 0 **** --- 1,22 ---- + /* { dg-do compile } */ + /* { dg-options "-O2 -ftree-vectorize" } */ + + #include <stdint.h> + + void + f (uint8_t *restrict a, uint8_t *restrict b) + { + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 7] + 1; + a[i * 8 + 1] = b[i * 8 + 6] + 2; + a[i * 8 + 2] = b[i * 8 + 5] + 3; + a[i * 8 + 3] = b[i * 8 + 4] + 4; + a[i * 8 + 4] = b[i * 8 + 3] + 5; + a[i * 8 + 5] = b[i * 8 + 2] + 6; + a[i * 8 + 6] = b[i * 8 + 1] + 7; + a[i * 8 + 7] = b[i * 8 + 0] + 8; + } + } + + /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c =================================================================== *** /dev/null 2018-07-26 10:26:13.137955424 +0100 --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c 2018-08-23 09:59:35.245682525 +0100 *************** *** 0 **** --- 1,22 ---- + /* { dg-do compile } */ + /* { dg-options "-O2 -ftree-vectorize" } */ + + #include <stdint.h> + + void + f (uint8_t *restrict a, uint8_t *restrict b) + { + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 3] + 1; + a[i * 8 + 1] = b[i * 8 + 2] + 2; + a[i * 8 + 2] = b[i * 8 + 1] + 3; + a[i * 8 + 3] = b[i * 8 + 0] + 4; + a[i * 8 + 4] = b[i * 8 + 7] + 5; + a[i * 8 + 5] = b[i * 8 + 6] + 6; + a[i * 8 + 6] = b[i * 8 + 5] + 7; + a[i * 8 + 7] = b[i * 8 + 4] + 8; + } + } + + /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c =================================================================== *** /dev/null 2018-07-26 10:26:13.137955424 +0100 --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c 2018-08-23 09:59:35.245682525 +0100 *************** *** 0 **** --- 1,22 ---- + /* { dg-do compile } */ + /* { dg-options "-O2 -ftree-vectorize" } */ + + #include <stdint.h> + + void + f (uint8_t *restrict a, uint8_t *restrict b) + { + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 1] + 1; + a[i * 8 + 1] = b[i * 8 + 0] + 2; + a[i * 8 + 2] = b[i * 8 + 3] + 3; + a[i * 8 + 3] = b[i * 8 + 2] + 4; + a[i * 8 + 4] = b[i * 8 + 5] + 5; + a[i * 8 + 5] = b[i * 8 + 4] + 6; + a[i * 8 + 6] = b[i * 8 + 7] + 7; + a[i * 8 + 7] = b[i * 8 + 6] + 8; + } + } + + /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 1 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c =================================================================== *** /dev/null 2018-07-26 10:26:13.137955424 +0100 --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c 2018-08-23 09:59:35.245682525 +0100 *************** *** 0 **** --- 1,22 ---- + /* { dg-do compile } */ + /* { dg-options "-O2 -ftree-vectorize" } */ + + #include <stdint.h> + + void + f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c) + { + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8] + c[i * 8]; + a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1]; + a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2]; + a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3]; + a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4]; + a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5]; + a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6]; + a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7]; + } + } + + /* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c =================================================================== *** /dev/null 2018-07-26 10:26:13.137955424 +0100 --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c 2018-08-23 09:59:35.245682525 +0100 *************** *** 0 **** --- 1,32 ---- + /* { dg-do compile } */ + /* { dg-options "-O2 -ftree-vectorize" } */ + + #include <stdint.h> + + void + f (uint8_t *restrict a, uint8_t *restrict b, + uint8_t *restrict c, uint8_t *restrict d) + { + for (int i = 0; i < 100; ++i) + { + a[i * 8] = c[i * 8] + d[i * 8]; + a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1]; + a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2]; + a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3]; + a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4]; + a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5]; + a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6]; + a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7]; + b[i * 8] = c[i * 8 + 1] + d[i * 8]; + b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1]; + b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2]; + b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3]; + b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4]; + b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5]; + b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6]; + b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7]; + } + } + + /* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */ + /* { dg-final { scan-assembler {\ttrn2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c =================================================================== *** /dev/null 2018-07-26 10:26:13.137955424 +0100 --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c 2018-08-23 09:59:35.245682525 +0100 *************** *** 0 **** --- 1,22 ---- + /* { dg-do compile } */ + /* { dg-options "-O2 -ftree-vectorize" } */ + + #include <stdint.h> + + void + f (uint8_t *restrict a, uint8_t *restrict b) + { + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 3] + 1; + a[i * 8 + 1] = b[i * 8 + 6] + 1; + a[i * 8 + 2] = b[i * 8 + 0] + 1; + a[i * 8 + 3] = b[i * 8 + 2] + 1; + a[i * 8 + 4] = b[i * 8 + 1] + 1; + a[i * 8 + 5] = b[i * 8 + 7] + 1; + a[i * 8 + 6] = b[i * 8 + 5] + 1; + a[i * 8 + 7] = b[i * 8 + 4] + 1; + } + } + + /* { dg-final { scan-assembler-times {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c =================================================================== *** /dev/null 2018-07-26 10:26:13.137955424 +0100 --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c 2018-08-23 09:59:35.245682525 +0100 *************** *** 0 **** --- 1,22 ---- + /* { dg-do compile } */ + /* { dg-options "-O2 -ftree-vectorize" } */ + + #include <stdint.h> + + void + f (uint8_t *restrict a, uint8_t *restrict b) + { + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 1] + 1; + a[i * 8 + 1] = b[i * 8 + 7] + 2; + a[i * 8 + 2] = b[i * 8 + 1] + 3; + a[i * 8 + 3] = b[i * 8 + 7] + 4; + a[i * 8 + 4] = b[i * 8 + 1] + 5; + a[i * 8 + 5] = b[i * 8 + 7] + 6; + a[i * 8 + 6] = b[i * 8 + 1] + 7; + a[i * 8 + 7] = b[i * 8 + 7] + 8; + } + } + + /* { dg-final { scan-assembler {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */