Hi,
I've been sitting on this for quite some time already and always missed
stage 1.  This implements support for vectorizing strided stores with an
unknown but loop-invariant stride, like:
void
sumit (float * __restrict dest,
       float * __restrict src, float * __restrict src2,
       int stride, int n)
{
  int i;
  for (i = 0; i < n; i++)
    dest[i*stride] = src[i] + src2[i];
}
I use the same scheme as for strided loads, i.e. expanding such a store
into N separate scalar stores (so alignment can also be ignored for
them); a sketch of the generated shape follows.
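To make that concrete, here's a rough sketch (using GNU C vector
extensions, VF = 4; names like vectemp/off are illustrative only, and
the vector loads are simplified, glossing over alignment/aliasing) of
the shape generated for the example above:

typedef float v4sf __attribute__ ((vector_size (16)));

void
sumit_vectorized (float * __restrict dest,
                  float * __restrict src, float * __restrict src2,
                  int stride, int n)
{
  int i;
  float *off = dest;   /* pointer IV, bumped by stride after each store */
  for (i = 0; i + 4 <= n; i += 4)
    {
      /* The rhs is still computed as a full vector ...  */
      v4sf vectemp = *(v4sf *) (src + i) + *(v4sf *) (src2 + i);
      /* ... but the store side is open-coded as VF scalar stores, so
         the runtime stride and the alignment of dest don't matter.  */
      *off = vectemp[0]; off += stride;
      *off = vectemp[1]; off += stride;
      *off = vectemp[2]; off += stride;
      *off = vectemp[3]; off += stride;
    }
  for (; i < n; i++)   /* scalar epilogue */
    dest[i * stride] = src[i] + src2[i];
}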
This doesn't yet fix PR65962, because that one uses a _constant_ step
(see the sketch after this paragraph).  A constant step makes us try a
grouped access, which isn't supported for stores when there's a gap
(which there is).  Unfortunately vect_analyze_group_access actively
changes some vect info even before detecting that it's not a grouped
access, so some rollback would have to be implemented; I decided to
defer that to a follow-up.
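For reference, the constant-step shape looks like this (illustrative
only, not the PR's actual testcase); a literal stride makes the store a
grouped access with a gap:

void
scale (float * __restrict dest, float * __restrict src, int n)
{
  int i;
  for (i = 0; i < n; i++)
    dest[2*i] = src[i];   /* constant step 2 -> grouped store, gap of 1 */
}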
Regstrapped on x86-64-linux, no regressions (I had to adjust two
Fortran testcases where one more loop is vectorized now).  Okay for trunk?
Ciao,
Michael.
* tree-vectorizer.h (struct _stmt_vec_info): Rename stride_load_p
to strided_p.
(STMT_VINFO_STRIDE_LOAD_P): Rename to ...
(STMT_VINFO_STRIDED_P): ... this.
* tree-vect-data-refs.c (vect_compute_data_ref_alignment): Adjust.
(vect_verify_datarefs_alignment): Likewise.
(vect_enhance_data_refs_alignment): Likewise.
(vect_analyze_data_ref_access): Likewise.
(vect_analyze_data_refs): Accept strided stores.
* tree-vect-stmts.c (vect_model_store_cost): Count strided stores.
(vect_model_load_cost): Adjust for macro rename.
(vectorizable_mask_load_store): Likewise.
(vectorizable_load): Likewise.
(vectorizable_store): Open code strided stores.
testsuite/
* gcc.dg/vect/vect-strided-store.c: New test.
* gfortran.dg/vect/fast-math-pr37021.f90: Adjust.
* gfortran.dg/vect/fast-math-rnflow-trs2a2.f90: Adjust.
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 96afc7a..6d8f17e 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -665,7 +665,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
/* Strided loads perform only component accesses, misalignment information
is irrelevant for them. */
- if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+ if (STMT_VINFO_STRIDED_P (stmt_info))
return true;
misalign = DR_INIT (dr);
@@ -942,7 +942,7 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
/* Strided loads perform only component accesses, alignment is
irrelevant for them. */
- if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+ if (STMT_VINFO_STRIDED_P (stmt_info))
continue;
supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
@@ -1409,7 +1409,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
/* Strided loads perform only component accesses, alignment is
irrelevant for them. */
- if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+ if (STMT_VINFO_STRIDED_P (stmt_info))
continue;
supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
@@ -1701,7 +1701,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
/* Strided loads perform only component accesses, alignment is
irrelevant for them. */
- if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+ if (STMT_VINFO_STRIDED_P (stmt_info))
continue;
save_misalignment = DR_MISALIGNMENT (dr);
@@ -1821,7 +1821,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
/* Strided loads perform only component accesses, alignment is
irrelevant for them. */
- if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+ if (STMT_VINFO_STRIDED_P (stmt_info))
continue;
supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
@@ -2368,7 +2368,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
/* Assume this is a DR handled by non-constant strided load case. */
if (TREE_CODE (step) != INTEGER_CST)
- return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
+ return STMT_VINFO_STRIDED_P (stmt_info);
/* Not consecutive access - check if it's a part of interleaving group. */
return vect_analyze_group_access (dr);
@@ -3764,8 +3764,7 @@ again:
else if (loop_vinfo
&& TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
{
- if (nested_in_vect_loop_p (loop, stmt)
- || !DR_IS_READ (dr))
+ if (nested_in_vect_loop_p (loop, stmt))
{
if (dump_enabled_p ())
{
@@ -3777,7 +3776,7 @@ again:
}
return false;
}
- STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
+ STMT_VINFO_STRIDED_P (stmt_info) = true;
}
}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 2ce6d4d..d268eb0 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1014,7 +1014,19 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
}
/* Costs of the stores. */
- vect_get_store_cost (first_dr, ncopies, &inside_cost, body_cost_vec);
+ if (STMT_VINFO_STRIDED_P (stmt_info))
+ {
+ /* N scalar stores plus extracting the elements. */
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ inside_cost += record_stmt_cost (body_cost_vec,
+ ncopies * TYPE_VECTOR_SUBPARTS (vectype),
+ scalar_store, stmt_info, 0, vect_body);
+ inside_cost += record_stmt_cost (body_cost_vec,
+ ncopies * TYPE_VECTOR_SUBPARTS (vectype),
+ vec_to_scalar, stmt_info, 0, vect_body);
+ }
+ else
+ vect_get_store_cost (first_dr, ncopies, &inside_cost, body_cost_vec);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
@@ -1127,7 +1139,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
}
/* The loads themselves. */
- if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+ if (STMT_VINFO_STRIDED_P (stmt_info))
{
/* N scalar loads plus gathering them into a vector. */
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
@@ -1820,7 +1832,7 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
return false;
- if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+ if (STMT_VINFO_STRIDED_P (stmt_info))
return false;
if (STMT_VINFO_GATHER_P (stmt_info))
@@ -5013,7 +5025,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
tree dataref_ptr = NULL_TREE;
tree dataref_offset = NULL_TREE;
gimple ptr_incr = NULL;
- int nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
int ncopies;
int j;
gimple next_stmt, first_stmt = NULL;
@@ -5100,38 +5112,42 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
if (!STMT_VINFO_DATA_REF (stmt_info))
return false;
- negative =
- tree_int_cst_compare (loop && nested_in_vect_loop_p (loop, stmt)
- ? STMT_VINFO_DR_STEP (stmt_info) : DR_STEP (dr),
- size_zero_node) < 0;
- if (negative && ncopies > 1)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "multiple types with negative step.\n");
- return false;
- }
-
- if (negative)
+ if (STMT_VINFO_STRIDED_P (stmt_info))
+ ;
+ else
{
- gcc_assert (!grouped_store);
- alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
- if (alignment_support_scheme != dr_aligned
- && alignment_support_scheme != dr_unaligned_supported)
+ negative =
+ tree_int_cst_compare (loop && nested_in_vect_loop_p (loop, stmt)
+ ? STMT_VINFO_DR_STEP (stmt_info) : DR_STEP (dr),
+ size_zero_node) < 0;
+ if (negative && ncopies > 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "negative step but alignment required.\n");
+ "multiple types with negative step.\n");
return false;
}
- if (dt != vect_constant_def
- && dt != vect_external_def
- && !perm_mask_for_reverse (vectype))
+ if (negative)
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "negative step and reversing not supported.\n");
- return false;
+ gcc_assert (!grouped_store);
+ alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
+ if (alignment_support_scheme != dr_aligned
+ && alignment_support_scheme != dr_unaligned_supported)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "negative step but alignment required.\n");
+ return false;
+ }
+ if (dt != vect_constant_def
+ && dt != vect_external_def
+ && !perm_mask_for_reverse (vectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "negative step and reversing not
supported.\n");
+ return false;
+ }
}
}
@@ -5230,6 +5246,113 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
dump_printf_loc (MSG_NOTE, vect_location,
"transform store. ncopies = %d\n", ncopies);
+ if (STMT_VINFO_STRIDED_P (stmt_info))
+ {
+ gimple_stmt_iterator incr_gsi;
+ bool insert_after;
+ gimple incr;
+ tree offvar;
+ tree ivstep;
+ tree running_off;
+ gimple_seq stmts = NULL;
+ tree stride_base, stride_step, alias_off;
+ tree vec_oprnd;
+
+ gcc_assert (!nested_in_vect_loop_p (loop, stmt));
+
+ stride_base
+ = fold_build_pointer_plus
+ (unshare_expr (DR_BASE_ADDRESS (dr)),
+ size_binop (PLUS_EXPR,
+ convert_to_ptrofftype (unshare_expr (DR_OFFSET (dr))),
+ convert_to_ptrofftype (DR_INIT (dr))));
+ stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (dr)));
+
+ /* For a store with loop-invariant (but other than power-of-2)
+ stride (i.e. not a grouped access) like so:
+
+ for (i = 0; i < n; i += stride)
+ array[i] = ...;
+
+ we generate a new induction variable and new stores from
+ the components of the (vectorized) rhs:
+
+ for (j = 0; ; j += VF*stride)
+ vectemp = ...;
+ tmp1 = vectemp[0];
+ array[j] = tmp1;
+ tmp2 = vectemp[1];
+ array[j + stride] = tmp2;
+ ...
+ */
+
+ ivstep = stride_step;
+ ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
+ build_int_cst (TREE_TYPE (ivstep),
+ ncopies * nunits));
+
+ standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+
+ create_iv (stride_base, ivstep, NULL,
+ loop, &incr_gsi, insert_after,
+ &offvar, NULL);
+ incr = gsi_stmt (incr_gsi);
+ set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
+
+ stride_step = force_gimple_operand (stride_step, &stmts, true, NULL_TREE);
+ if (stmts)
+ gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+
+ prev_stmt_info = NULL;
+ running_off = offvar;
+ alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
+ for (j = 0; j < ncopies; j++)
+ {
+ /* We've set op and dt above, from gimple_assign_rhs1 (stmt),
+ and first_stmt == stmt. */
+ if (j == 0)
+ vec_oprnd = vect_get_vec_def_for_operand (op, first_stmt, NULL);
+ else
+ vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
+
+ for (i = 0; i < nunits; i++)
+ {
+ tree newref, newoff;
+ gimple incr, assign;
+ tree size = TYPE_SIZE (elem_type);
+ /* Extract the i'th component. */
+ tree pos = fold_build2 (MULT_EXPR, bitsizetype, bitsize_int (i),
+ size);
+ tree elem = fold_build3 (BIT_FIELD_REF, elem_type, vec_oprnd,
+ size, pos);
+
+ elem = force_gimple_operand_gsi (gsi, elem, true,
+ NULL_TREE, true,
+ GSI_SAME_STMT);
+
+ newref = build2 (MEM_REF, TREE_TYPE (vectype),
+ running_off, alias_off);
+
+ /* And store it to *running_off. */
+ assign = gimple_build_assign (newref, elem);
+ vect_finish_stmt_generation (stmt, assign, gsi);
+
+ newoff = copy_ssa_name (running_off, NULL);
+ incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
+ running_off, stride_step);
+ vect_finish_stmt_generation (stmt, incr, gsi);
+
+ running_off = newoff;
+ if (j == 0 && i == i)
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = assign;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = assign;
+ prev_stmt_info = vinfo_for_stmt (assign);
+ }
+ }
+ return true;
+ }
+
dr_chain.create (group_size);
oprnds.create (group_size);
@@ -5846,7 +5969,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
return false;
}
}
- else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+ else if (STMT_VINFO_STRIDED_P (stmt_info))
;
else
{
@@ -6079,7 +6202,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
}
return true;
}
- else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+ else if (STMT_VINFO_STRIDED_P (stmt_info))
{
gimple_stmt_iterator incr_gsi;
bool insert_after;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 0796cc1..d231626 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -643,7 +643,9 @@ typedef struct _stmt_vec_info {
/* For loads only, true if this is a gather load. */
bool gather_p;
- bool stride_load_p;
+
+ /* True if this is an access with loop-invariant stride. */
+ bool strided_p;
/* For both loads and stores. */
bool simd_lane_access_p;
@@ -661,7 +663,7 @@ typedef struct _stmt_vec_info {
#define STMT_VINFO_VECTORIZABLE(S) (S)->vectorizable
#define STMT_VINFO_DATA_REF(S) (S)->data_ref_info
#define STMT_VINFO_GATHER_P(S) (S)->gather_p
-#define STMT_VINFO_STRIDE_LOAD_P(S) (S)->stride_load_p
+#define STMT_VINFO_STRIDED_P(S) (S)->strided_p
#define STMT_VINFO_SIMD_LANE_ACCESS_P(S) (S)->simd_lane_access_p
#define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address
diff --git a/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90 b/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90
index b17ac9c..d5f5d40 100644
--- a/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90
+++ b/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90
@@ -14,5 +14,5 @@ subroutine to_product_of(self,a,b,a1,a2)
end do
end subroutine
-! { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } }
+! { dg-final { scan-tree-dump "vectorized 2 loops" "vect" } }
! { dg-final { cleanup-tree-dump "vect" } }
diff --git a/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90 b/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90
index 1d13cea..625be83 100644
--- a/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90
+++ b/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90
@@ -29,5 +29,5 @@
return
end function trs2a2
-! { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } }
+! { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } }
! { dg-final { cleanup-tree-dump "vect" } }
diff --git a/gcc/testsuite/gcc.dg/vect/vect-strided-store.c b/gcc/testsuite/gcc.dg/vect/vect-strided-store.c
new file mode 100644
index 0000000..32bcff9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-strided-store.c
@@ -0,0 +1,30 @@
+/* { dg-require-effective-target vect_float } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+void __attribute__((noinline))
+sumit (float * __restrict dest,
+ float * __restrict src, float * __restrict src2,
+ int stride, int n)
+{
+ int i;
+ for (i = 0; i < n; i++)
+ dest[i*stride] = src[i] + src2[i];
+}
+
+int main()
+{
+ int i;
+ float src[] = {1, 2, 3, 4, 5, 6, 7, 8};
+ float dest[8];
+ check_vect ();
+ sumit (dest, src, src, 1, 8);
+ for (i = 0; i < 8; i++)
+ if (2*src[i] != dest[i])
+ abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */