This patch tries to use gather loads for strided accesses,
rather than falling back to VMAT_ELEMENTWISE.
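
For example, with a loop of the kind exercised by the new tests
(the function name and element type here are just for illustration):

    void
    f (int *restrict dest, int *restrict src, int stride, int n)
    {
      for (int i = 0; i < n; ++i)
        dest[i] += src[i * stride];
    }

the strided access to src should now be vectorized on SVE as a gather
load with a scaled vector index (the ld1w/ld1d forms that the tests
below check for), rather than as a series of scalar element loads.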

Tested on aarch64-linux-gnu (with and without SVE), x86_64-linux-gnu
and powerpc64le-linux-gnu.  OK to install?

Richard


2017-11-17  Richard Sandiford  <richard.sandif...@linaro.org>
            Alan Hayward  <alan.hayw...@arm.com>
            David Sherwood  <david.sherw...@arm.com>

gcc/
        * tree-vectorizer.h (vect_create_data_ref_ptr): Take an extra
        optional tree argument.
        * tree-vect-data-refs.c (vect_create_data_ref_ptr): Take the
        iv_step as an optional argument, but continue to use the current
        value as a fallback.
        (bump_vector_ptr): Use operand_equal_p rather than tree_int_cst_compare
        to compare the updates.
        * tree-vect-stmts.c (vect_use_strided_gather_scatters_p): New function.
        (get_load_store_type): Use it when handling a strided access.
        (vect_get_strided_load_store_ops): New function.
        (vect_get_data_ptr_increment): Likewise.
        (vectorizable_load): Handle strided gather loads.  Always pass
        a step to vect_create_data_ref_ptr and bump_vector_ptr.

gcc/testsuite/
        * gcc.target/aarch64/sve_strided_load_1.c: New test.
        * gcc.target/aarch64/sve_strided_load_2.c: Likewise.
        * gcc.target/aarch64/sve_strided_load_3.c: Likewise.

Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h       2017-11-17 21:57:43.920003721 +0000
+++ gcc/tree-vectorizer.h       2017-11-17 21:59:27.828803892 +0000
@@ -1461,7 +1461,7 @@ extern void vect_record_base_alignments
 extern tree vect_create_data_ref_ptr (gimple *, tree, struct loop *, tree,
                                      tree *, gimple_stmt_iterator *,
                                      gimple **, bool, bool *,
-                                     tree = NULL_TREE);
+                                     tree = NULL_TREE, tree = NULL_TREE);
 extern tree bump_vector_ptr (tree, gimple *, gimple_stmt_iterator *, gimple *,
                             tree);
 extern tree vect_create_destination_var (tree, tree);
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c   2017-11-17 21:57:43.919003822 +0000
+++ gcc/tree-vect-data-refs.c   2017-11-17 21:59:27.827803892 +0000
@@ -4362,6 +4362,10 @@ vect_create_addr_base_for_vector_ref (gi
        to the initial address accessed by the data-ref in STMT.  This is
        similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
        in bytes.
+   8. IV_STEP (optional, defaults to NULL): the amount that should be added
+       to the IV during each iteration of the loop.  NULL says to move
+       by one copy of AGGR_TYPE up or down, depending on the step of the
+       data reference.
 
    Output:
    1. Declare a new ptr to vector_type, and have it point to the base of the
@@ -4394,7 +4398,8 @@ vect_create_addr_base_for_vector_ref (gi
 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
                          tree offset, tree *initial_address,
                          gimple_stmt_iterator *gsi, gimple **ptr_incr,
-                         bool only_init, bool *inv_p, tree byte_offset)
+                         bool only_init, bool *inv_p, tree byte_offset,
+                         tree iv_step)
 {
   const char *base_name;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -4418,7 +4423,8 @@ vect_create_data_ref_ptr (gimple *stmt,
   tree step;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
 
-  gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
+  gcc_assert (iv_step != NULL_TREE
+             || TREE_CODE (aggr_type) == ARRAY_TYPE
              || TREE_CODE (aggr_type) == VECTOR_TYPE);
 
   if (loop_vinfo)
@@ -4559,14 +4565,17 @@ vect_create_data_ref_ptr (gimple *stmt,
     aptr = aggr_ptr_init;
   else
     {
-      /* The step of the aggregate pointer is the type size.  */
-      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
-      /* One exception to the above is when the scalar step of the load in
-        LOOP is zero. In this case the step here is also zero.  */
-      if (*inv_p)
-       iv_step = size_zero_node;
-      else if (tree_int_cst_sgn (step) == -1)
-       iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+      if (iv_step == NULL_TREE)
+       {
+         /* The step of the aggregate pointer is the type size.  */
+         iv_step = TYPE_SIZE_UNIT (aggr_type);
+         /* One exception to the above is when the scalar step of the load in
+            LOOP is zero. In this case the step here is also zero.  */
+         if (*inv_p)
+           iv_step = size_zero_node;
+         else if (tree_int_cst_sgn (step) == -1)
+           iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+       }
 
       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 
@@ -4699,7 +4708,7 @@ bump_vector_ptr (tree dataref_ptr, gimpl
       if (use == dataref_ptr)
         SET_USE (use_p, new_dataref_ptr);
       else
-        gcc_assert (tree_int_cst_compare (use, update) == 0);
+        gcc_assert (operand_equal_p (use, update, 0));
     }
 
   return new_dataref_ptr;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c       2017-11-17 21:57:43.920003721 +0000
+++ gcc/tree-vect-stmts.c       2017-11-17 21:59:27.828803892 +0000
@@ -1847,6 +1847,43 @@ prepare_load_store_mask (tree mask_type,
   return and_res;
 }
 
+/* Return true if we can use gather/scatter internal functions to
+   vectorize STMT, which is a grouped or strided load or store.
+   When returning true, fill in GS_INFO with the information required
+   to perform the operation.  */
+
+static bool
+vect_use_strided_gather_scatters_p (gimple *stmt, loop_vec_info loop_vinfo,
+                                   gather_scatter_info *gs_info)
+{
+  if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info))
+    return false;
+
+  scalar_mode element_mode = SCALAR_TYPE_MODE (gs_info->element_type);
+  unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
+  tree offset_type = TREE_TYPE (gs_info->offset);
+  unsigned int offset_bits = TYPE_PRECISION (offset_type);
+
+  /* Enforced by vect_check_gather_scatter.  */
+  gcc_assert (element_bits >= offset_bits);
+
+  /* If the elements are wider than the offset, convert the offset to the
+     same width, without changing its sign.  */
+  if (element_bits > offset_bits)
+    {
+      bool unsigned_p = TYPE_UNSIGNED (offset_type);
+      offset_type = build_nonstandard_integer_type (element_bits, unsigned_p);
+      gs_info->offset = fold_convert (offset_type, gs_info->offset);
+    }
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "using gather/scatter for strided/grouped access,"
+                    " scale = %d\n", gs_info->scale);
+
+  return true;
+}
+
 /* STMT is a non-strided load or store, meaning that it accesses
    elements with a known constant step.  Return -1 if that step
    is negative, 0 if it is zero, and 1 if it is greater than zero.  */
@@ -2200,7 +2237,11 @@ get_load_store_type (gimple *stmt, tree
   else if (STMT_VINFO_STRIDED_P (stmt_info))
     {
       gcc_assert (!slp);
-      *memory_access_type = VMAT_ELEMENTWISE;
+      if (loop_vinfo
+         && vect_use_strided_gather_scatters_p (stmt, loop_vinfo, gs_info))
+       *memory_access_type = VMAT_GATHER_SCATTER;
+      else
+       *memory_access_type = VMAT_ELEMENTWISE;
     }
   else
     {
@@ -2640,6 +2681,71 @@ vect_get_gather_scatter_ops (struct loop
                                              offset_vectype);
 }
 
+/* Prepare to implement a grouped or strided load or store using
+   the gather load or scatter store operation described by GS_INFO.
+   STMT is the load or store statement.
+
+   Set *DATAREF_BUMP to the amount that should be added to the base
+   address after each copy of the vectorized statement.  Set *VEC_OFFSET
+   to an invariant offset vector in which element I has the value
+   I * DR_STEP / SCALE.  */
+
+static void
+vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
+                                gather_scatter_info *gs_info,
+                                tree *dataref_bump, tree *vec_offset)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  gimple_seq stmts;
+
+  tree bump = size_binop (MULT_EXPR,
+                         fold_convert (sizetype, DR_STEP (dr)),
+                         size_int (TYPE_VECTOR_SUBPARTS (vectype)));
+  *dataref_bump = force_gimple_operand (bump, &stmts, true, NULL_TREE);
+  if (stmts)
+    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+
+  /* The offset given in GS_INFO can have pointer type, so use the element
+     type of the vector instead.  */
+  tree offset_type = TREE_TYPE (gs_info->offset);
+  tree offset_vectype = get_vectype_for_scalar_type (offset_type);
+  offset_type = TREE_TYPE (offset_vectype);
+
+  /* Calculate X = DR_STEP / SCALE and convert it to the appropriate type.  */
+  tree step = size_binop (EXACT_DIV_EXPR, DR_STEP (dr),
+                         ssize_int (gs_info->scale));
+  step = fold_convert (offset_type, step);
+  step = force_gimple_operand (step, &stmts, true, NULL_TREE);
+
+  /* Create {0, X, X*2, X*3, ...}.  */
+  *vec_offset = gimple_build (&stmts, VEC_SERIES_EXPR, offset_vectype,
+                             build_zero_cst (offset_type), step);
+  if (stmts)
+    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+}
+
+/* Return the amount that should be added to a vector pointer to move
+   to the next or previous copy of AGGR_TYPE.  DR is the data reference
+   being vectorized and MEMORY_ACCESS_TYPE describes the type of
+   vectorization.  */
+
+static tree
+vect_get_data_ptr_increment (data_reference *dr, tree aggr_type,
+                            vect_memory_access_type memory_access_type)
+{
+  if (memory_access_type == VMAT_INVARIANT)
+    return size_zero_node;
+
+  tree iv_step = TYPE_SIZE_UNIT (aggr_type);
+  tree step = vect_dr_behavior (dr)->step;
+  if (tree_int_cst_sgn (step) == -1)
+    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+  return iv_step;
+}
+
 /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}.  */
 
 static bool
@@ -7417,6 +7523,9 @@ vectorizable_load (gimple *stmt, gimple_
       return true;
     }
 
+  if (memory_access_type == VMAT_GATHER_SCATTER)
+    grouped_load = false;
+
   if (grouped_load)
     {
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
@@ -7628,13 +7737,29 @@ vectorizable_load (gimple *stmt, gimple_
   if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
     offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
 
-  if (memory_access_type == VMAT_LOAD_STORE_LANES)
-    aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+  tree bump;
+  tree vec_offset = NULL_TREE;
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+    {
+      aggr_type = NULL_TREE;
+      bump = NULL_TREE;
+    }
+  else if (memory_access_type == VMAT_GATHER_SCATTER)
+    {
+      aggr_type = elem_type;
+      vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
+                                      &bump, &vec_offset);
+    }
   else
-    aggr_type = vectype;
+    {
+      if (memory_access_type == VMAT_LOAD_STORE_LANES)
+       aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+      else
+       aggr_type = vectype;
+      bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+    }
 
   tree vec_mask = NULL_TREE;
-  tree vec_offset = NULL_TREE;
   prev_stmt_info = NULL;
   poly_uint64 group_elt = 0;
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -7666,7 +7791,7 @@ vectorizable_load (gimple *stmt, gimple_
                = vect_create_data_ref_ptr (first_stmt_for_drptr, aggr_type,
                                            at_loop, offset, &dummy, gsi,
                                            &ptr_incr, simd_lane_access_p,
-                                           &inv_p, byte_offset);
+                                           &inv_p, byte_offset, bump);
              /* Adjust the pointer by the difference to first_stmt.  */
              data_reference_p ptrdr
                = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt_for_drptr));
@@ -7688,7 +7813,7 @@ vectorizable_load (gimple *stmt, gimple_
              = vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
                                          offset, &dummy, gsi, &ptr_incr,
                                          simd_lane_access_p, &inv_p,
-                                         byte_offset);
+                                         byte_offset, bump);
          if (mask)
            vec_mask = vect_get_vec_def_for_operand (mask, stmt,
                                                     mask_vectype);
@@ -7697,7 +7822,7 @@ vectorizable_load (gimple *stmt, gimple_
        {
          if (dataref_offset)
            dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
-                                             TYPE_SIZE_UNIT (aggr_type));
+                                             bump);
          else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
            {
              gimple *def_stmt;
@@ -7706,8 +7831,8 @@ vectorizable_load (gimple *stmt, gimple_
              vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset);
            }
          else
-           dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
-                                          TYPE_SIZE_UNIT (aggr_type));
+           dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
+                                          stmt, bump);
          if (mask)
            {
              gimple *def_stmt;
@@ -7783,7 +7908,7 @@ vectorizable_load (gimple *stmt, gimple_
 
              if (i > 0)
                dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
-                                              stmt, NULL_TREE);
+                                              stmt, bump);
 
              /* 2. Create the vector-load in the loop.  */
              switch (alignment_support_scheme)
Index: gcc/testsuite/gcc.target/aarch64/sve_strided_load_1.c
===================================================================
--- /dev/null   2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_strided_load_1.c       2017-11-17 21:59:27.825803892 +0000
@@ -0,0 +1,40 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX8
+#define INDEX8 int8_t
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS)                             \
+  void __attribute__ ((noinline, noclone))                     \
+  f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest,            \
+                         DATA_TYPE *restrict src,              \
+                         INDEX##BITS stride, INDEX##BITS n)    \
+  {                                                            \
+    for (INDEX##BITS i = 0; i < n; ++i)                                \
+      dest[i] += src[i * stride];                              \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)                        \
+  T (DATA_TYPE, 8)                             \
+  T (DATA_TYPE, 16)                            \
+  T (DATA_TYPE, 32)                            \
+  T (DATA_TYPE, 64)
+
+#define TEST_ALL(T)                            \
+  TEST_TYPE (T, int32_t)                       \
+  TEST_TYPE (T, uint32_t)                      \
+  TEST_TYPE (T, float)                         \
+  TEST_TYPE (T, int64_t)                       \
+  TEST_TYPE (T, uint64_t)                      \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 12 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve_strided_load_2.c
===================================================================
--- /dev/null   2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_strided_load_2.c       2017-11-17 21:59:27.826803893 +0000
@@ -0,0 +1,18 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX8 uint8_t
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_strided_load_1.c"
+
+/* 8 and 16 bits are signed because the multiplication promotes to int.
+   Using uxtw for all 9 would be OK.  */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+/* The 32-bit loop needs to honor the defined overflow in uint32_t,
+   so we vectorize the offset calculation.  This means that the
+   64-bit version needs two copies.  */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve_strided_load_3.c
===================================================================
--- /dev/null   2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_strided_load_3.c       2017-11-17 21:59:27.826803893 +0000
@@ -0,0 +1,32 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, OTHER_TYPE)                               \
+  void __attribute__ ((noinline, noclone))                             \
+  f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest,                    \
+                         DATA_TYPE *restrict src,                      \
+                         OTHER_TYPE *restrict other,                   \
+                         OTHER_TYPE mask,                              \
+                         int stride, int n)                            \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                        \
+      dest[i] = src[i * stride] + (OTHER_TYPE) (other[i] | mask);      \
+  }
+
+#define TEST_ALL(T)                            \
+  T (int32_t, int16_t)                         \
+  T (uint32_t, int16_t)                                \
+  T (float, int16_t)                           \
+  T (int64_t, int32_t)                         \
+  T (uint64_t, int32_t)                                \
+  T (double, int32_t)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
