On Wed, Jun 17, 2015 at 5:32 PM, Michael Matz <m...@suse.de> wrote:
> Hi,
>
> this implements support for strided grouped stores in the non-SLP case
> (the SLP case was handled already).  Before, we were ignoring all but
> the last store in a group.  That led to a miscompile of GemsFDTD; the
> testcase reflects that situation.
>
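> For illustration, a loop of the affected shape looks like the following
> sketch (made up for this mail, not taken from the benchmark):
>
>   /* Each iteration performs a group of two strided stores, the real
>      and the imaginary component of a _Complex double (cplxlower has
>      split the complex store into components by the time the
>      vectorizer runs).  The stride is a runtime value, so this hits
>      the strided-store path; before the patch the non-SLP path kept
>      only the last store of the group.  */
>   void
>   set_all (_Complex double *a, _Complex double v, int stride, int n)
>   {
>     int i;
>     for (i = 0; i < n; i++)
>       a[i * stride] = v;
>   }
>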
> Also, since r224511 yesterday, grouped strided non-SLP loads were broken:
> all loads in a group were using the same base address, which is okay only
> for the SLP case as the code is structured right now (only the SLP case
> uses the permutation path; non-SLP emits scalar loads directly).
>
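> The load side can be sketched similarly (again hypothetical, just to
> show the shape of the affected accesses):
>
>   /* A group of two strided loads per iteration.  The non-SLP path
>      emits one scalar load per group member, so each load has to use
>      its own data-ref's base address; with the shared base of the
>      group's first element both loads would read the real part.  */
>   double
>   sum_parts (_Complex double *b, int stride, int n)
>   {
>     double s = 0.0;
>     int i;
>     for (i = 0; i < n; i++)
>       s += __real__ b[i * stride] + __imag__ b[i * stride];
>     return s;
>   }
>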
> Regstrapping on x86-64-linux in progress, okay if that passes?

Ok.

Thanks,
Richard.

>
> Ciao,
> Michael.
>         PR middle-end/66253
>         * tree-vect-stmts.c (vectorizable_store): Implement non-SLP
>         grouped strided stores.
>         (vectorizable_load): Don't use the DR from first_stmt in
>         the non-SLP grouped strided case.
>
> testsuite/
>         * gcc.dg/vect/pr66253.c: New testcase.
>
> Index: tree-vect-stmts.c
> ===================================================================
> --- tree-vect-stmts.c   (revision 224562)
> +++ tree-vect-stmts.c   (working copy)
> @@ -5262,16 +5262,17 @@ vectorizable_store (gimple stmt, gimple_
>        gimple_seq stmts = NULL;
>        tree stride_base, stride_step, alias_off;
>        tree vec_oprnd;
> +      unsigned int g;
>
>        gcc_assert (!nested_in_vect_loop_p (loop, stmt));
>
>        stride_base
>         = fold_build_pointer_plus
> -           (unshare_expr (DR_BASE_ADDRESS (dr)),
> +           (unshare_expr (DR_BASE_ADDRESS (first_dr)),
>              size_binop (PLUS_EXPR,
> -                        convert_to_ptrofftype (unshare_expr (DR_OFFSET (dr))),
> -                        convert_to_ptrofftype (DR_INIT(dr))));
> -      stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (dr)));
> +                        convert_to_ptrofftype (unshare_expr (DR_OFFSET (first_dr))),
> +                        convert_to_ptrofftype (DR_INIT(first_dr))));
> +      stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (first_dr)));
>
>        /* For a store with loop-invariant (but other than power-of-2)
>           stride (i.e. not a grouped access) like so:
> @@ -5302,6 +5303,7 @@ vectorizable_store (gimple stmt, gimple_
>             ltype = vectype;
>           ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
>           ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> +         group_size = 1;
>         }
>
>        ivstep = stride_step;
> @@ -5322,65 +5324,89 @@ vectorizable_store (gimple stmt, gimple_
>         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
>
>        prev_stmt_info = NULL;
> -      running_off = offvar;
> -      alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
> -      for (j = 0; j < ncopies; j++)
> +      alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (first_dr)), 0);
> +      next_stmt = first_stmt;
> +      for (g = 0; g < group_size; g++)
>         {
> -         /* We've set op and dt above, from gimple_assign_rhs1(stmt),
> -            and first_stmt == stmt.  */
> -         if (j == 0)
> -           {
> -             if (slp)
> -               {
> -                 vect_get_vec_defs (op, NULL_TREE, stmt, &vec_oprnds, NULL,
> -                                    slp_node, -1);
> -                 vec_oprnd = vec_oprnds[0];
> -               }
> -             else
> -               vec_oprnd = vect_get_vec_def_for_operand (op, first_stmt, NULL);
> -           }
> -         else
> +         running_off = offvar;
> +         if (g)
>             {
> -             if (slp)
> -               vec_oprnd = vec_oprnds[j];
> -             else
> -               vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
> -           }
> -
> -         for (i = 0; i < nstores; i++)
> -           {
> -             tree newref, newoff;
> -             gimple incr, assign;
> -             tree size = TYPE_SIZE (ltype);
> -             /* Extract the i'th component.  */
> -             tree pos = fold_build2 (MULT_EXPR, bitsizetype, bitsize_int (i),
> +             tree size = TYPE_SIZE_UNIT (ltype);
> +             tree pos = fold_build2 (MULT_EXPR, sizetype, size_int (g),
>                                       size);
> -             tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
> -                                      size, pos);
> -
> -             elem = force_gimple_operand_gsi (gsi, elem, true,
> -                                              NULL_TREE, true,
> -                                              GSI_SAME_STMT);
> -
> -             newref = build2 (MEM_REF, ltype,
> -                              running_off, alias_off);
> -
> -             /* And store it to *running_off.  */
> -             assign = gimple_build_assign (newref, elem);
> -             vect_finish_stmt_generation (stmt, assign, gsi);
> -
> -             newoff = copy_ssa_name (running_off, NULL);
> +             tree newoff = copy_ssa_name (running_off, NULL);
>               incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
> -                                         running_off, stride_step);
> +                                         running_off, pos);
>               vect_finish_stmt_generation (stmt, incr, gsi);
> -
>               running_off = newoff;
> -             if (j == 0 && i == 0)
> -               STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = assign;
> +           }
> +         for (j = 0; j < ncopies; j++)
> +           {
> +             /* We've set op and dt above, from gimple_assign_rhs1(stmt),
> +                and first_stmt == stmt.  */
> +             if (j == 0)
> +               {
> +                 if (slp)
> +                   {
> +                     vect_get_vec_defs (op, NULL_TREE, stmt, &vec_oprnds, NULL,
> +                                        slp_node, -1);
> +                     vec_oprnd = vec_oprnds[0];
> +                   }
> +                 else
> +                   {
> +                     gcc_assert (gimple_assign_single_p (next_stmt));
> +                     op = gimple_assign_rhs1 (next_stmt);
> +                     vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
> +                                                               NULL);
> +                   }
> +               }
>               else
> -               STMT_VINFO_RELATED_STMT (prev_stmt_info) = assign;
> -             prev_stmt_info = vinfo_for_stmt (assign);
> +               {
> +                 if (slp)
> +                   vec_oprnd = vec_oprnds[j];
> +                 else
> +                   vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
> +               }
> +
> +             for (i = 0; i < nstores; i++)
> +               {
> +                 tree newref, newoff;
> +                 gimple incr, assign;
> +                 tree size = TYPE_SIZE (ltype);
> +                 /* Extract the i'th component.  */
> +                 tree pos = fold_build2 (MULT_EXPR, bitsizetype,
> +                                         bitsize_int (i), size);
> +                 tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
> +                                          size, pos);
> +
> +                 elem = force_gimple_operand_gsi (gsi, elem, true,
> +                                                  NULL_TREE, true,
> +                                                  GSI_SAME_STMT);
> +
> +                 newref = build2 (MEM_REF, ltype,
> +                                  running_off, alias_off);
> +
> +                 /* And store it to *running_off.  */
> +                 assign = gimple_build_assign (newref, elem);
> +                 vect_finish_stmt_generation (stmt, assign, gsi);
> +
> +                 newoff = copy_ssa_name (running_off, NULL);
> +                 incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
> +                                             running_off, stride_step);
> +                 vect_finish_stmt_generation (stmt, incr, gsi);
> +
> +                 running_off = newoff;
> +                 if (g == group_size - 1)
> +                   {
> +                     if (j == 0 && i == 0)
> +                       STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = assign;
> +                     else
> +                       STMT_VINFO_RELATED_STMT (prev_stmt_info) = assign;
> +                     prev_stmt_info = vinfo_for_stmt (assign);
> +                   }
> +               }
>             }
> +         next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
>         }
>        return true;
>      }
> @@ -6265,7 +6291,7 @@ vectorizable_load (gimple stmt, gimple_s
>
>        gcc_assert (!nested_in_vect_loop);
>
> -      if (grouped_load)
> +      if (slp && grouped_load)
>         first_dr = STMT_VINFO_DATA_REF
>             (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
>        else
> Index: testsuite/gcc.dg/vect/pr66253.c
> ===================================================================
> --- testsuite/gcc.dg/vect/pr66253.c     (revision 0)
> +++ testsuite/gcc.dg/vect/pr66253.c     (working copy)
> @@ -0,0 +1,51 @@
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-require-effective-target vect_hw_misalign } */
> +
> +#include "tree-vect.h"
> +
> +void __attribute__((noinline,noclone))
> +test1(_Complex double * __restrict__ a, _Complex double * __restrict__ b,
> +      double * __restrict__ c, int stride, int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    {
> +      a[i*stride] = 0.5 * b[i*stride] * c[i*stride];
> +    }
> +}
> +
> +double ca[256];
> +_Complex double ia[256];
> +_Complex double da[256];
> +
> +extern void abort (void);
> +
> +int main ()
> +{
> +  int i;
> +  int stride;
> +
> +  check_vect ();
> +
> +  for (stride = 1; stride < 15; stride++)
> +    {
> +      for (i = 0; i < 256; i++)
> +       {
> +         __real__ ia[i] = (i + stride) % 19;
> +         __imag__ ia[i] = (i + stride) % 23;
> +         ca[i] = (i + stride) % 29;
> +         __asm__ volatile ("");
> +       }
> +
> +      test1(da, ia, ca, stride, 256/stride);
> +
> +      for (i = 0; i < 256/stride; i++)
> +       {
> +         if (da[i*stride] != 0.5 * ia[i*stride] * ca[i*stride])
> +           abort ();
> +       }
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
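>
> (FWIW, the new test alone can be run from the build tree with
>  make check-gcc RUNTESTFLAGS="vect.exp=pr66253.c".)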
