On Wed, Jun 17, 2015 at 5:32 PM, Michael Matz <[email protected]> wrote:
> Hi,
>
> this implements support for strided grouped stores in the non-SLP case
> (the SLP case existed already). Before, we were ignoring all but the
> last store in a group; that led to a miscompile of GemsFDTD, and the
> testcase reflects that situation.
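>
> As a rough illustration (the names below are made up, and whether a
> given loop ends up in the SLP or the non-SLP variant depends on the
> rest of the loop; the real testcase uses _Complex double and hits the
> non-SLP path), the affected shape is a group of stores sharing one
> run-time stride:
>
> void
> store_group (double *p, double *q, int stride, int n)
> {
>   int i;
>   for (i = 0; i < n; i++)
>     {
>       /* Two stores with the same run-time stride, one element apart,
>          form a store group; previously only the last member of the
>          group was actually emitted.  */
>       p[i*stride] = q[i*stride];
>       p[i*stride + 1] = q[i*stride + 1];
>     }
> }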
>
> Also, since r224511 yesterday, grouped strided non-SLP loads were broken:
> all loads in a group were using the same base address, which is okay only
> for the SLP case as the code is structured right now (only the SLP case
> uses the permutation path; non-SLP emits scalar loads directly).
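>
> For the load side, again only as a sketch with made-up names: each
> member of a load group needs its own address, but with r224511 the
> non-SLP path, which emits scalar loads directly, used the base address
> of the first group member for every load.
>
> void
> load_group (double *a, double *b, int stride, int n)
> {
>   int i;
>   for (i = 0; i < n; i++)
>     /* b[i*stride] and b[i*stride + 1] form one load group with a
>        run-time stride; each scalar load emitted for the group must use
>        its own DR, not the DR of the first statement in the group.  */
>     a[i] = b[i*stride] + b[i*stride + 1];
> }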
>
> Regstrapping on x86-64-linux is in progress; okay if that passes?
Ok.
Thanks,
Richard.
>
> Ciao,
> Michael.
> PR middle-end/66253
> * tree-vect-stmts.c (vectorizable_store): Implement non-SLP
> grouped strided stores.
> (vectorizable_load): Don't use the DR from first_stmt in
> the non-SLP grouped strided case.
>
> testsuite/
> * gcc.dg/vect/pr66253.c: New testcase.
>
> Index: tree-vect-stmts.c
> ===================================================================
> --- tree-vect-stmts.c (revision 224562)
> +++ tree-vect-stmts.c (working copy)
> @@ -5262,16 +5262,17 @@ vectorizable_store (gimple stmt, gimple_
> gimple_seq stmts = NULL;
> tree stride_base, stride_step, alias_off;
> tree vec_oprnd;
> + unsigned int g;
>
> gcc_assert (!nested_in_vect_loop_p (loop, stmt));
>
> stride_base
> = fold_build_pointer_plus
> - (unshare_expr (DR_BASE_ADDRESS (dr)),
> + (unshare_expr (DR_BASE_ADDRESS (first_dr)),
> size_binop (PLUS_EXPR,
> - convert_to_ptrofftype (unshare_expr (DR_OFFSET (dr))),
> - convert_to_ptrofftype (DR_INIT(dr))));
> - stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (dr)));
> + convert_to_ptrofftype (unshare_expr (DR_OFFSET (first_dr))),
> + convert_to_ptrofftype (DR_INIT(first_dr))));
> + stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (first_dr)));
>
> /* For a store with loop-invariant (but other than power-of-2)
> stride (i.e. not a grouped access) like so:
> @@ -5302,6 +5303,7 @@ vectorizable_store (gimple stmt, gimple_
> ltype = vectype;
> ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
> ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> + group_size = 1;
> }
>
> ivstep = stride_step;
> @@ -5322,65 +5324,89 @@ vectorizable_store (gimple stmt, gimple_
> gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
>
> prev_stmt_info = NULL;
> - running_off = offvar;
> - alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
> - for (j = 0; j < ncopies; j++)
> + alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (first_dr)), 0);
> + next_stmt = first_stmt;
> + for (g = 0; g < group_size; g++)
> {
> - /* We've set op and dt above, from gimple_assign_rhs1(stmt),
> - and first_stmt == stmt. */
> - if (j == 0)
> - {
> - if (slp)
> - {
> - vect_get_vec_defs (op, NULL_TREE, stmt, &vec_oprnds, NULL,
> - slp_node, -1);
> - vec_oprnd = vec_oprnds[0];
> - }
> - else
> - vec_oprnd = vect_get_vec_def_for_operand (op, first_stmt, NULL);
> - }
> - else
> + running_off = offvar;
> + if (g)
> {
> - if (slp)
> - vec_oprnd = vec_oprnds[j];
> - else
> - vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
> - }
> -
> - for (i = 0; i < nstores; i++)
> - {
> - tree newref, newoff;
> - gimple incr, assign;
> - tree size = TYPE_SIZE (ltype);
> - /* Extract the i'th component. */
> - tree pos = fold_build2 (MULT_EXPR, bitsizetype, bitsize_int (i),
> + tree size = TYPE_SIZE_UNIT (ltype);
> + tree pos = fold_build2 (MULT_EXPR, sizetype, size_int (g),
> size);
> - tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
> - size, pos);
> -
> - elem = force_gimple_operand_gsi (gsi, elem, true,
> - NULL_TREE, true,
> - GSI_SAME_STMT);
> -
> - newref = build2 (MEM_REF, ltype,
> - running_off, alias_off);
> -
> - /* And store it to *running_off. */
> - assign = gimple_build_assign (newref, elem);
> - vect_finish_stmt_generation (stmt, assign, gsi);
> -
> - newoff = copy_ssa_name (running_off, NULL);
> + tree newoff = copy_ssa_name (running_off, NULL);
> incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
> - running_off, stride_step);
> + running_off, pos);
> vect_finish_stmt_generation (stmt, incr, gsi);
> -
> running_off = newoff;
> - if (j == 0 && i == 0)
> - STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = assign;
> + }
> + for (j = 0; j < ncopies; j++)
> + {
> + /* We've set op and dt above, from gimple_assign_rhs1(stmt),
> + and first_stmt == stmt. */
> + if (j == 0)
> + {
> + if (slp)
> + {
> + vect_get_vec_defs (op, NULL_TREE, stmt, &vec_oprnds, NULL,
> + slp_node, -1);
> + vec_oprnd = vec_oprnds[0];
> + }
> + else
> + {
> + gcc_assert (gimple_assign_single_p (next_stmt));
> + op = gimple_assign_rhs1 (next_stmt);
> + vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
> + NULL);
> + }
> + }
> else
> - STMT_VINFO_RELATED_STMT (prev_stmt_info) = assign;
> - prev_stmt_info = vinfo_for_stmt (assign);
> + {
> + if (slp)
> + vec_oprnd = vec_oprnds[j];
> + else
> + vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
> + }
> +
> + for (i = 0; i < nstores; i++)
> + {
> + tree newref, newoff;
> + gimple incr, assign;
> + tree size = TYPE_SIZE (ltype);
> + /* Extract the i'th component. */
> + tree pos = fold_build2 (MULT_EXPR, bitsizetype,
> + bitsize_int (i), size);
> + tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
> + size, pos);
> +
> + elem = force_gimple_operand_gsi (gsi, elem, true,
> + NULL_TREE, true,
> + GSI_SAME_STMT);
> +
> + newref = build2 (MEM_REF, ltype,
> + running_off, alias_off);
> +
> + /* And store it to *running_off. */
> + assign = gimple_build_assign (newref, elem);
> + vect_finish_stmt_generation (stmt, assign, gsi);
> +
> + newoff = copy_ssa_name (running_off, NULL);
> + incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
> + running_off, stride_step);
> + vect_finish_stmt_generation (stmt, incr, gsi);
> +
> + running_off = newoff;
> + if (g == group_size - 1)
> + {
> + if (j == 0 && i == 0)
> + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = assign;
> + else
> + STMT_VINFO_RELATED_STMT (prev_stmt_info) = assign;
> + prev_stmt_info = vinfo_for_stmt (assign);
> + }
> + }
> }
> + next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
> }
> return true;
> }
> @@ -6265,7 +6291,7 @@ vectorizable_load (gimple stmt, gimple_s
>
> gcc_assert (!nested_in_vect_loop);
>
> - if (grouped_load)
> + if (slp && grouped_load)
> first_dr = STMT_VINFO_DATA_REF
> (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
> else
> Index: testsuite/gcc.dg/vect/pr66253.c
> ===================================================================
> --- testsuite/gcc.dg/vect/pr66253.c (revision 0)
> +++ testsuite/gcc.dg/vect/pr66253.c (working copy)
> @@ -0,0 +1,51 @@
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-require-effective-target vect_hw_misalign } */
> +
> +#include "tree-vect.h"
> +
> +void __attribute__((noinline,noclone))
> +test1(_Complex double * __restrict__ a, _Complex double * __restrict__ b,
> + double * __restrict__ c, int stride, int n)
> +{
> + int i;
> + for (i = 0; i < n; i++)
> + {
> + a[i*stride] = 0.5 * b[i*stride] * c[i*stride];
> + }
> +}
> +
> +double ca[256];
> +_Complex double ia[256];
> +_Complex double da[256];
> +
> +extern void abort (void);
> +
> +int main ()
> +{
> + int i;
> + int stride;
> +
> + check_vect ();
> +
> + for (stride = 1; stride < 15; stride++)
> + {
> + for (i = 0; i < 256; i++)
> + {
> + __real__ ia[i] = (i + stride) % 19;
> + __imag__ ia[i] = (i + stride) % 23;
> + ca[i] = (i + stride) % 29;
> + __asm__ volatile ("");
> + }
> +
> + test1(da, ia, ca, stride, 256/stride);
> +
> + for (i = 0; i < 256/stride; i++)
> + {
> + if (da[i*stride] != 0.5 * ia[i*stride] * ca[i*stride])
> + abort ();
> + }
> + }
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */