The SLP code currently punts for all variable-length permutes.
This patch makes it handle the easy case of N->N permutes in which
the number of vector lanes is a multiple of N.  Every permute then
uses the same mask, and that mask repeats (with a stride) every
N elements.

The patch uses the same path for constant-length vectors,
since it should be slightly cheaper in terms of compile time.

Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
and x86_64-linux-gnu.  OK to install?

Richard


2018-08-23  Richard Sandiford  <richard.sandif...@arm.com>

gcc/
        * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
        the case in which the permute needs only a single element and
        repeats for every vector of the result.  Extend that case to
        handle variable-length vectors.
        * tree-vect-stmts.c (vectorizable_load): Update accordingly.

gcc/testsuite/
        * gcc.target/aarch64/sve/slp_perm_1.c: New test.
        * gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
        * gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
        * gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
        * gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
        * gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
        * gcc.target/aarch64/sve/slp_perm_7.c: Likewise.

Index: gcc/tree-vect-slp.c
===================================================================
*** gcc/tree-vect-slp.c 2018-08-21 14:47:08.339163256 +0100
--- gcc/tree-vect-slp.c 2018-08-23 09:59:35.245682525 +0100
*************** vect_transform_slp_perm_load (slp_tree n
*** 3606,3618 ****
  {
    stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
    vec_info *vinfo = stmt_info->vinfo;
-   tree mask_element_type = NULL_TREE, mask_type;
    int vec_index = 0;
    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
!   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
    unsigned int mask_element;
    machine_mode mode;
-   unsigned HOST_WIDE_INT nunits, const_vf;
  
    if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
      return false;
--- 3606,3616 ----
  {
    stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
    vec_info *vinfo = stmt_info->vinfo;
    int vec_index = 0;
    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
!   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
    unsigned int mask_element;
    machine_mode mode;
  
    if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
      return false;
*************** vect_transform_slp_perm_load (slp_tree n
*** 3620,3641 ****
    stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
  
    mode = TYPE_MODE (vectype);
! 
!   /* At the moment, all permutations are represented using per-element
!      indices, so we can't cope with variable vector lengths or
!      vectorization factors.  */
!   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
!       || !vf.is_constant (&const_vf))
!     return false;
! 
!   /* The generic VEC_PERM_EXPR code always uses an integral type of the
!      same size as the vector element being permuted.  */
!   mask_element_type = lang_hooks.types.type_for_mode
!     (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
!   mask_type = get_vectype_for_scalar_type (mask_element_type);
!   vec_perm_builder mask (nunits, nunits, 1);
!   mask.quick_grow (nunits);
!   vec_perm_indices indices;
  
    /* Initialize the vect stmts of NODE to properly insert the generated
       stmts later.  */
--- 3618,3624 ----
    stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
  
    mode = TYPE_MODE (vectype);
!   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  
    /* Initialize the vect stmts of NODE to properly insert the generated
       stmts later.  */
*************** vect_transform_slp_perm_load (slp_tree n
*** 3669,3682 ****
    bool noop_p = true;
    *n_perms = 0;
  
!   for (unsigned int j = 0; j < const_vf; j++)
      {
!       for (int k = 0; k < group_size; k++)
        {
!         unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
!                           + j * DR_GROUP_SIZE (stmt_info));
!         vec_index = i / nunits;
!         mask_element = i % nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
--- 3652,3704 ----
    bool noop_p = true;
    *n_perms = 0;
  
!   vec_perm_builder mask;
!   unsigned int nelts_to_build;
!   unsigned int nvectors_per_build;
!   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
!                     && multiple_p (nunits, group_size));
!   if (repeating_p)
!     {
!       /* A single vector contains a whole number of copies of the node, so:
!        (a) all permutes can use the same mask; and
!        (b) the permutes only need a single vector input.  */
!       mask.new_vector (nunits, group_size, 3);
!       nelts_to_build = mask.encoded_nelts ();
!       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
!     }
!   else
!     {
!       /* We need to construct a separate mask for each vector statement.  */
!       unsigned HOST_WIDE_INT const_nunits, const_vf;
!       if (!nunits.is_constant (&const_nunits)
!         || !vf.is_constant (&const_vf))
!       return false;
!       mask.new_vector (const_nunits, const_nunits, 1);
!       nelts_to_build = const_vf * group_size;
!       nvectors_per_build = 1;
!     }
! 
!   unsigned int count = mask.encoded_nelts ();
!   mask.quick_grow (count);
!   vec_perm_indices indices;
! 
!   for (unsigned int j = 0; j < nelts_to_build; j++)
      {
!       unsigned int iter_num = j / group_size;
!       unsigned int stmt_num = j % group_size;
!       unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
!                       + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
!       if (repeating_p)
        {
!         first_vec_index = 0;
!         mask_element = i;
!       }
!       else
!       {
!         /* Enforced before the loop when !repeating_p.  */
!         unsigned int const_nunits = nunits.to_constant ();
!         vec_index = i / const_nunits;
!         mask_element = i % const_nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
*************** vect_transform_slp_perm_load (slp_tree n
*** 3686,3692 ****
                   || second_vec_index == -1)
            {
              second_vec_index = vec_index;
!             mask_element += nunits;
            }
          else
            {
--- 3708,3714 ----
                   || second_vec_index == -1)
            {
              second_vec_index = vec_index;
!             mask_element += const_nunits;
            }
          else
            {
*************** vect_transform_slp_perm_load (slp_tree n
*** 3702,3751 ****
              return false;
            }
  
!         gcc_assert (mask_element < 2 * nunits);
!         if (mask_element != index)
!           noop_p = false;
!         mask[index++] = mask_element;
  
!         if (index == nunits && !noop_p)
            {
!             indices.new_vector (mask, 2, nunits);
!             if (!can_vec_perm_const_p (mode, indices))
                {
!                 if (dump_enabled_p ())
                    {
!                     dump_printf_loc (MSG_MISSED_OPTIMIZATION,
!                                      vect_location, 
!                                      "unsupported vect permute { ");
!                     for (i = 0; i < nunits; ++i)
!                       {
!                         dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
!                         dump_printf (MSG_MISSED_OPTIMIZATION, " ");
!                       }
!                     dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                    }
!                 gcc_assert (analyze_only);
!                 return false;
                }
! 
!             ++*n_perms;
            }
  
!         if (index == nunits)
            {
!             if (!analyze_only)
!               {
!                 tree mask_vec = NULL_TREE;
                  
!                 if (! noop_p)
!                   mask_vec = vec_perm_indices_to_tree (mask_type, indices);
  
!                 if (second_vec_index == -1)
!                   second_vec_index = first_vec_index;
  
                  /* Generate the permute statement if necessary.  */
!                 tree first_vec = dr_chain[first_vec_index];
!                 tree second_vec = dr_chain[second_vec_index];
                  stmt_vec_info perm_stmt_info;
                  if (! noop_p)
                    {
--- 3724,3777 ----
              return false;
            }
  
!         gcc_assert (mask_element < 2 * const_nunits);
!       }
! 
!       if (mask_element != index)
!       noop_p = false;
!       mask[index++] = mask_element;
  
!       if (index == count && !noop_p)
!       {
!         indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
!         if (!can_vec_perm_const_p (mode, indices))
            {
!             if (dump_enabled_p ())
                {
!                 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
!                                  vect_location,
!                                  "unsupported vect permute { ");
!                 for (i = 0; i < count; ++i)
                    {
!                     dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
!                     dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                    }
!                 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                }
!             gcc_assert (analyze_only);
!             return false;
            }
  
!         ++*n_perms;
!       }
! 
!       if (index == count)
!       {
!         if (!analyze_only)
            {
!             tree mask_vec = NULL_TREE;
                  
!             if (! noop_p)
!               mask_vec = vect_gen_perm_mask_checked (vectype, indices);
  
!             if (second_vec_index == -1)
!               second_vec_index = first_vec_index;
  
+             for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
+               {
                  /* Generate the permute statement if necessary.  */
!                 tree first_vec = dr_chain[first_vec_index + ri];
!                 tree second_vec = dr_chain[second_vec_index + ri];
                  stmt_vec_info perm_stmt_info;
                  if (! noop_p)
                    {
*************** vect_transform_slp_perm_load (slp_tree n
*** 3771,3782 ****
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
                    = perm_stmt_info;
                }
- 
-             index = 0;
-             first_vec_index = -1;
-             second_vec_index = -1;
-             noop_p = true;
            }
        }
      }
  
--- 3797,3808 ----
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
                    = perm_stmt_info;
                }
            }
+ 
+         index = 0;
+         first_vec_index = -1;
+         second_vec_index = -1;
+         noop_p = true;
        }
      }
  
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c       2018-08-22 13:58:45.652208084 +0100
--- gcc/tree-vect-stmts.c       2018-08-23 09:59:35.245682525 +0100
*************** vectorizable_load (stmt_vec_info stmt_in
*** 8003,8015 ****
        if (slp)
        {
          grouped_load = false;
!         /* For SLP permutation support we need to load the whole group,
!            not only the number of vector stmts the permutation result
!            fits in.  */
!         if (slp_perm)
            {
!             /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
!                variable VF.  */
              unsigned int const_vf = vf.to_constant ();
              unsigned int const_nunits = nunits.to_constant ();
              vec_num = CEIL (group_size * const_vf, const_nunits);
--- 8003,8020 ----
        if (slp)
        {
          grouped_load = false;
!         /* If an SLP permutation is from N elements to N elements,
!            and if one vector holds a whole number of N, we can load
!            the inputs to the permutation in the same way as an
!            unpermuted sequence.  In other cases we need to load the
!            whole group, not only the number of vector stmts the
!            permutation result fits in.  */
!         if (slp_perm
!             && (group_size != SLP_INSTANCE_GROUP_SIZE (slp_node_instance)
!                 || !multiple_p (nunits, group_size)))
            {
!             /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for
!                variable VF; see vect_transform_slp_perm_load.  */
              unsigned int const_vf = vf.to_constant ();
              unsigned int const_nunits = nunits.to_constant ();
              vec_num = CEIL (group_size * const_vf, const_nunits);
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c
===================================================================
*** /dev/null   2018-07-26 10:26:13.137955424 +0100
--- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c   2018-08-23 
09:59:35.245682525 +0100
***************
*** 0 ****
--- 1,22 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-vectorize" } */
+ 
+ #include <stdint.h>
+ 
+ void
+ f (uint8_t *restrict a, uint8_t *restrict b)
+ {
+   for (int i = 0; i < 100; ++i)
+     {
+       a[i * 8] = b[i * 8 + 7] + 1;
+       a[i * 8 + 1] = b[i * 8 + 6] + 2;
+       a[i * 8 + 2] = b[i * 8 + 5] + 3;
+       a[i * 8 + 3] = b[i * 8 + 4] + 4;
+       a[i * 8 + 4] = b[i * 8 + 3] + 5;
+       a[i * 8 + 5] = b[i * 8 + 2] + 6;
+       a[i * 8 + 6] = b[i * 8 + 1] + 7;
+       a[i * 8 + 7] = b[i * 8 + 0] + 8;
+     }
+ }
+ 
+ /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d\n} 1 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c
===================================================================
*** /dev/null   2018-07-26 10:26:13.137955424 +0100
--- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c   2018-08-23 
09:59:35.245682525 +0100
***************
*** 0 ****
--- 1,22 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-vectorize" } */
+ 
+ #include <stdint.h>
+ 
+ void
+ f (uint8_t *restrict a, uint8_t *restrict b)
+ {
+   for (int i = 0; i < 100; ++i)
+     {
+       a[i * 8] = b[i * 8 + 3] + 1;
+       a[i * 8 + 1] = b[i * 8 + 2] + 2;
+       a[i * 8 + 2] = b[i * 8 + 1] + 3;
+       a[i * 8 + 3] = b[i * 8 + 0] + 4;
+       a[i * 8 + 4] = b[i * 8 + 7] + 5;
+       a[i * 8 + 5] = b[i * 8 + 6] + 6;
+       a[i * 8 + 6] = b[i * 8 + 5] + 7;
+       a[i * 8 + 7] = b[i * 8 + 4] + 8;
+     }
+ }
+ 
+ /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s\n} 1 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c
===================================================================
*** /dev/null   2018-07-26 10:26:13.137955424 +0100
--- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c   2018-08-23 
09:59:35.245682525 +0100
***************
*** 0 ****
--- 1,22 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-vectorize" } */
+ 
+ #include <stdint.h>
+ 
+ void
+ f (uint8_t *restrict a, uint8_t *restrict b)
+ {
+   for (int i = 0; i < 100; ++i)
+     {
+       a[i * 8] = b[i * 8 + 1] + 1;
+       a[i * 8 + 1] = b[i * 8 + 0] + 2;
+       a[i * 8 + 2] = b[i * 8 + 3] + 3;
+       a[i * 8 + 3] = b[i * 8 + 2] + 4;
+       a[i * 8 + 4] = b[i * 8 + 5] + 5;
+       a[i * 8 + 5] = b[i * 8 + 4] + 6;
+       a[i * 8 + 6] = b[i * 8 + 7] + 7;
+       a[i * 8 + 7] = b[i * 8 + 6] + 8;
+     }
+ }
+ 
+ /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h\n} 1 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c
===================================================================
*** /dev/null   2018-07-26 10:26:13.137955424 +0100
--- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c   2018-08-23 
09:59:35.245682525 +0100
***************
*** 0 ****
--- 1,22 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-vectorize" } */
+ 
+ #include <stdint.h>
+ 
+ void
+ f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
+ {
+   for (int i = 0; i < 100; ++i)
+     {
+       a[i * 8] = b[i * 8] + c[i * 8];
+       a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
+       a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
+       a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
+       a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
+       a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
+       a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
+       a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
+     }
+ }
+ 
+ /* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 
} } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c
===================================================================
*** /dev/null   2018-07-26 10:26:13.137955424 +0100
--- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c   2018-08-23 
09:59:35.245682525 +0100
***************
*** 0 ****
--- 1,32 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-vectorize" } */
+ 
+ #include <stdint.h>
+ 
+ void
+ f (uint8_t *restrict a, uint8_t *restrict b,
+    uint8_t *restrict c, uint8_t *restrict d)
+ {
+   for (int i = 0; i < 100; ++i)
+     {
+       a[i * 8] = c[i * 8] + d[i * 8];
+       a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1];
+       a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2];
+       a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3];
+       a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4];
+       a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5];
+       a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6];
+       a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7];
+       b[i * 8] = c[i * 8 + 1] + d[i * 8];
+       b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1];
+       b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2];
+       b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3];
+       b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4];
+       b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5];
+       b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6];
+       b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7];
+     }
+ }
+ 
+ /* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 
} } */
+ /* { dg-final { scan-assembler {\ttrn2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 
} } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c
===================================================================
*** /dev/null   2018-07-26 10:26:13.137955424 +0100
--- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c   2018-08-23 
09:59:35.245682525 +0100
***************
*** 0 ****
--- 1,22 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-vectorize" } */
+ 
+ #include <stdint.h>
+ 
+ void
+ f (uint8_t *restrict a, uint8_t *restrict b)
+ {
+   for (int i = 0; i < 100; ++i)
+     {
+       a[i * 8] = b[i * 8 + 3] + 1;
+       a[i * 8 + 1] = b[i * 8 + 6] + 1;
+       a[i * 8 + 2] = b[i * 8 + 0] + 1;
+       a[i * 8 + 3] = b[i * 8 + 2] + 1;
+       a[i * 8 + 4] = b[i * 8 + 1] + 1;
+       a[i * 8 + 5] = b[i * 8 + 7] + 1;
+       a[i * 8 + 6] = b[i * 8 + 5] + 1;
+       a[i * 8 + 7] = b[i * 8 + 4] + 1;
+     }
+ }
+ 
+ /* { dg-final { scan-assembler-times {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, 
z[0-9]+\.b\n} 1 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c
===================================================================
*** /dev/null   2018-07-26 10:26:13.137955424 +0100
--- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c   2018-08-23 
09:59:35.245682525 +0100
***************
*** 0 ****
--- 1,22 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-vectorize" } */
+ 
+ #include <stdint.h>
+ 
+ void
+ f (uint8_t *restrict a, uint8_t *restrict b)
+ {
+   for (int i = 0; i < 100; ++i)
+     {
+       a[i * 8] = b[i * 8 + 1] + 1;
+       a[i * 8 + 1] = b[i * 8 + 7] + 2;
+       a[i * 8 + 2] = b[i * 8 + 1] + 3;
+       a[i * 8 + 3] = b[i * 8 + 7] + 4;
+       a[i * 8 + 4] = b[i * 8 + 1] + 5;
+       a[i * 8 + 5] = b[i * 8 + 7] + 6;
+       a[i * 8 + 6] = b[i * 8 + 1] + 7;
+       a[i * 8 + 7] = b[i * 8 + 7] + 8;
+     }
+ }
+ 
+ /* { dg-final { scan-assembler {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 
} } */

Reply via email to