On Mon, Aug 14, 2023 at 2:49 PM Kewen.Lin <li...@linux.ibm.com> wrote:
>
> Hi Richi,
>
> on 2023/8/14 20:04, Richard Biener wrote:
> > On Mon, Aug 14, 2023 at 10:54 AM Kewen.Lin <li...@linux.ibm.com> wrote:
> >>
> >> Hi,
> >>
> >> Following Richi's suggestion [1], this patch moves the
> >> handling of VMAT_LOAD_STORE_LANES in the final loop nest
> >> of function vectorizable_load into its own loop.  Basically
> >> it duplicates the final loop nest, cleans up some useless
> >> setup code for the case of VMAT_LOAD_STORE_LANES, and
> >> removes some unreachable code.  It also removes the
> >> corresponding handling from the original final loop nest.
> >>
> >> Bootstrapped and regtested on x86_64-redhat-linux,
> >> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
> >
> > OK (I guess the big diff is mostly because of re-indenting).
>
> Thanks!  Yes, there is some code in the original final loop nest like
>
> if (memory_access_type == VMAT_LOAD_STORE_LANES)
>   {
>     ...
>   }
> else
>   {
>     ...
>   }
>
> Then the else arm is fully re-indented.
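>
> After the split, the structure is roughly like below (just a sketch of
> the new shape, not the exact code):
>
> if (memory_access_type == VMAT_LOAD_STORE_LANES)
>   {
>     for (j = 0; j < ncopies; j++)
>       {
>         /* Either do the costing or emit IFN_{MASK_,}LOAD_LANES.  */
>         ...
>       }
>     return true;
>   }
>
> /* Final loop nest, no longer handling VMAT_LOAD_STORE_LANES.  */
> for (j = 0; j < ncopies; j++)
>   ...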
>
> The other patch on VMAT_GATHER_SCATTER looks a bit better since
> it doesn't need re-indenting.

Yes, that's also because VMAT_LOAD_STORE_LANES isn't used with SLP, so
it makes even more sense to split that case out.

Richard.

> BR,
> Kewen
>
> >
> > Thanks,
> > Richard.
> >
> >> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
> >>
> >> gcc/ChangeLog:
> >>
> >>         * tree-vect-stmts.cc (vectorizable_load): Move the handling of
> >>         VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
> >>         and update the final nest accordingly.
> >> ---
> >>  gcc/tree-vect-stmts.cc | 1275 ++++++++++++++++++++--------------------
> >>  1 file changed, 634 insertions(+), 641 deletions(-)
> >>
> >> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> >> index 4f2d088484c..c361e16cb7b 100644
> >> --- a/gcc/tree-vect-stmts.cc
> >> +++ b/gcc/tree-vect-stmts.cc
> >> @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
> >>         vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
> >>                                        &vec_masks, mask_vectype);
> >>      }
> >> +
> >>    tree vec_mask = NULL_TREE;
> >> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
> >> +    {
> >> +      gcc_assert (alignment_support_scheme == dr_aligned
> >> +                 || alignment_support_scheme == dr_unaligned_supported);
> >> +      gcc_assert (grouped_load && !slp);
> >> +
> >> +      unsigned int inside_cost = 0, prologue_cost = 0;
> >> +      for (j = 0; j < ncopies; j++)
> >> +       {
> >> +         if (costing_p)
> >> +           {
> >> +             /* An IFN_LOAD_LANES will load all its vector results,
> >> +                regardless of which ones we actually need.  Account
> >> +                for the cost of unused results.  */
> >> +             if (first_stmt_info == stmt_info)
> >> +               {
> >> +                 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
> >> +                 stmt_vec_info next_stmt_info = first_stmt_info;
> >> +                 do
> >> +                   {
> >> +                     gaps -= 1;
> >> +                     next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
> >> +                   }
> >> +                 while (next_stmt_info);
> >> +                 if (gaps)
> >> +                   {
> >> +                     if (dump_enabled_p ())
> >> +                       dump_printf_loc (MSG_NOTE, vect_location,
> >> +                                        "vect_model_load_cost: %d "
> >> +                                        "unused vectors.\n",
> >> +                                        gaps);
> >> +                     vect_get_load_cost (vinfo, stmt_info, gaps,
> >> +                                         alignment_support_scheme,
> >> +                                         misalignment, false, &inside_cost,
> >> +                                         &prologue_cost, cost_vec, cost_vec,
> >> +                                         true);
> >> +                   }
> >> +               }
> >> +             vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
> >> +                                 misalignment, false, &inside_cost,
> >> +                                 &prologue_cost, cost_vec, cost_vec, true);
> >> +             continue;
> >> +           }
> >> +
> >> +         /* 1. Create the vector or array pointer update chain.  */
> >> +         if (j == 0)
> >> +           dataref_ptr
> >> +             = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> >> +                                         at_loop, offset, &dummy, gsi,
> >> +                                         &ptr_incr, false, bump);
> >> +         else
> >> +           {
> >> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> >> +             dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
> >> +                                            stmt_info, bump);
> >> +           }
> >> +         if (mask)
> >> +           vec_mask = vec_masks[j];
> >> +
> >> +         tree vec_array = create_vector_array (vectype, vec_num);
> >> +
> >> +         tree final_mask = NULL_TREE;
> >> +         if (loop_masks)
> >> +           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >> +                                            ncopies, vectype, j);
> >> +         if (vec_mask)
> >> +           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
> >> +                                          vec_mask, gsi);
> >> +
> >> +         gcall *call;
> >> +         if (final_mask)
> >> +           {
> >> +             /* Emit:
> >> +                  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> >> +                                               VEC_MASK).  */
> >> +             unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> >> +             tree alias_ptr = build_int_cst (ref_type, align);
> >> +             call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
> >> +                                                dataref_ptr, alias_ptr,
> >> +                                                final_mask);
> >> +           }
> >> +         else
> >> +           {
> >> +             /* Emit:
> >> +                  VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
> >> +             data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
> >> +             call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
> >> +           }
> >> +         gimple_call_set_lhs (call, vec_array);
> >> +         gimple_call_set_nothrow (call, true);
> >> +         vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> >> +
> >> +         dr_chain.create (vec_num);
> >> +         /* Extract each vector into an SSA_NAME.  */
> >> +         for (i = 0; i < vec_num; i++)
> >> +           {
> >> +             new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
> >> +                                           vec_array, i);
> >> +             dr_chain.quick_push (new_temp);
> >> +           }
> >> +
> >> +         /* Record the mapping between SSA_NAMEs and statements.  */
> >> +         vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
> >> +
> >> +         /* Record that VEC_ARRAY is now dead.  */
> >> +         vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
> >> +
> >> +         dr_chain.release ();
> >> +
> >> +         *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> >> +       }
> >> +
> >> +      if (costing_p && dump_enabled_p ())
> >> +       dump_printf_loc (MSG_NOTE, vect_location,
> >> +                        "vect_model_load_cost: inside_cost = %u, "
> >> +                        "prologue_cost = %u .\n",
> >> +                        inside_cost, prologue_cost);
> >> +
> >> +      return true;
> >> +    }
> >> +
> >>    poly_uint64 group_elt = 0;
> >>    unsigned int inside_cost = 0, prologue_cost = 0;
> >>    for (j = 0; j < ncopies; j++)
> >> @@ -10414,685 +10538,558 @@ vectorizable_load (vec_info *vinfo,
> >>         dr_chain.create (vec_num);
> >>
> >>        gimple *new_stmt = NULL;
> >> -      if (memory_access_type == VMAT_LOAD_STORE_LANES)
> >> +      for (i = 0; i < vec_num; i++)
> >>         {
> >> -         if (costing_p)
> >> -           {
> >> -             /* An IFN_LOAD_LANES will load all its vector results,
> >> -                regardless of which ones we actually need.  Account
> >> -                for the cost of unused results.  */
> >> -             if (grouped_load && first_stmt_info == stmt_info)
> >> -               {
> >> -                 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
> >> -                 stmt_vec_info next_stmt_info = first_stmt_info;
> >> -                 do
> >> -                   {
> >> -                     gaps -= 1;
> >> -                     next_stmt_info = DR_GROUP_NEXT_ELEMENT 
> >> (next_stmt_info);
> >> -                   }
> >> -                 while (next_stmt_info);
> >> -                 if (gaps)
> >> -                   {
> >> -                     if (dump_enabled_p ())
> >> -                       dump_printf_loc (MSG_NOTE, vect_location,
> >> -                                        "vect_model_load_cost: %d "
> >> -                                        "unused vectors.\n",
> >> -                                        gaps);
> >> -                     vect_get_load_cost (vinfo, stmt_info, gaps,
> >> -                                         alignment_support_scheme,
> >> -                                         misalignment, false, 
> >> &inside_cost,
> >> -                                         &prologue_cost, cost_vec, 
> >> cost_vec,
> >> -                                         true);
> >> -                   }
> >> -               }
> >> -             vect_get_load_cost (vinfo, stmt_info, 1, 
> >> alignment_support_scheme,
> >> -                                 misalignment, false, &inside_cost,
> >> -                                 &prologue_cost, cost_vec, cost_vec, 
> >> true);
> >> -             continue;
> >> -           }
> >> -         tree vec_array;
> >> -
> >> -         vec_array = create_vector_array (vectype, vec_num);
> >> -
> >>           tree final_mask = NULL_TREE;
> >> -         if (loop_masks)
> >> -           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >> -                                            ncopies, vectype, j);
> >> -         if (vec_mask)
> >> -           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >> -                                          final_mask, vec_mask, gsi);
> >> -
> >> -         gcall *call;
> >> -         if (final_mask)
> >> -           {
> >> -             /* Emit:
> >> -                  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> >> -                                               VEC_MASK).  */
> >> -             unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> >> -             tree alias_ptr = build_int_cst (ref_type, align);
> >> -             call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
> >> -                                                dataref_ptr, alias_ptr,
> >> -                                                final_mask);
> >> -           }
> >> -         else
> >> +         tree final_len = NULL_TREE;
> >> +         tree bias = NULL_TREE;
> >> +         if (!costing_p)
> >>             {
> >> -             /* Emit:
> >> -                  VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  
> >> */
> >> -             data_ref = create_array_ref (aggr_type, dataref_ptr, 
> >> ref_type);
> >> -             call = gimple_build_call_internal (IFN_LOAD_LANES, 1, 
> >> data_ref);
> >> -           }
> >> -         gimple_call_set_lhs (call, vec_array);
> >> -         gimple_call_set_nothrow (call, true);
> >> -         vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> >> -         new_stmt = call;
> >> +             if (loop_masks)
> >> +               final_mask = vect_get_loop_mask (loop_vinfo, gsi, 
> >> loop_masks,
> >> +                                                vec_num * ncopies, 
> >> vectype,
> >> +                                                vec_num * j + i);
> >> +             if (vec_mask)
> >> +               final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >> +                                              final_mask, vec_mask, gsi);
> >>
> >> -         /* Extract each vector into an SSA_NAME.  */
> >> -         for (i = 0; i < vec_num; i++)
> >> -           {
> >> -             new_temp = read_vector_array (vinfo, stmt_info, gsi, 
> >> scalar_dest,
> >> -                                           vec_array, i);
> >> -             dr_chain.quick_push (new_temp);
> >> +             if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, 
> >> ptr_incr,
> >> +                                              gsi, stmt_info, bump);
> >>             }
> >>
> >> -         /* Record the mapping between SSA_NAMEs and statements.  */
> >> -         vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
> >> -
> >> -         /* Record that VEC_ARRAY is now dead.  */
> >> -         vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
> >> -       }
> >> -      else
> >> -       {
> >> -         for (i = 0; i < vec_num; i++)
> >> +         /* 2. Create the vector-load in the loop.  */
> >> +         switch (alignment_support_scheme)
> >>             {
> >> -             tree final_mask = NULL_TREE;
> >> -             tree final_len = NULL_TREE;
> >> -             tree bias = NULL_TREE;
> >> -             if (!costing_p)
> >> -               {
> >> -                 if (loop_masks)
> >> -                   final_mask
> >> -                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >> -                                           vec_num * ncopies, vectype,
> >> -                                           vec_num * j + i);
> >> -                 if (vec_mask)
> >> -                   final_mask = prepare_vec_mask (loop_vinfo, 
> >> mask_vectype,
> >> -                                                  final_mask, vec_mask, 
> >> gsi);
> >> -
> >> -                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> -                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, 
> >> ptr_incr,
> >> -                                                  gsi, stmt_info, bump);
> >> -               }
> >> +           case dr_aligned:
> >> +           case dr_unaligned_supported:
> >> +             {
> >> +               unsigned int misalign;
> >> +               unsigned HOST_WIDE_INT align;
> >>
> >> -             /* 2. Create the vector-load in the loop.  */
> >> -             switch (alignment_support_scheme)
> >> -               {
> >> -               case dr_aligned:
> >> -               case dr_unaligned_supported:
> >> +               if (memory_access_type == VMAT_GATHER_SCATTER
> >> +                   && gs_info.ifn != IFN_LAST)
> >>                   {
> >> -                   unsigned int misalign;
> >> -                   unsigned HOST_WIDE_INT align;
> >> -
> >> -                   if (memory_access_type == VMAT_GATHER_SCATTER
> >> -                       && gs_info.ifn != IFN_LAST)
> >> +                   if (costing_p)
> >>                       {
> >> -                       if (costing_p)
> >> -                         {
> >> -                           unsigned int cnunits
> >> -                             = vect_nunits_for_cost (vectype);
> >> -                           inside_cost
> >> -                             = record_stmt_cost (cost_vec, cnunits,
> >> -                                                 scalar_load, stmt_info, 
> >> 0,
> >> -                                                 vect_body);
> >> -                           break;
> >> -                         }
> >> -                       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> -                         vec_offset = vec_offsets[vec_num * j + i];
> >> -                       tree zero = build_zero_cst (vectype);
> >> -                       tree scale = size_int (gs_info.scale);
> >> -
> >> -                       if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> >> -                         {
> >> -                           if (loop_lens)
> >> -                             final_len
> >> -                               = vect_get_loop_len (loop_vinfo, gsi, 
> >> loop_lens,
> >> -                                                    vec_num * ncopies, 
> >> vectype,
> >> -                                                    vec_num * j + i, 1);
> >> -                           else
> >> -                             final_len = build_int_cst (sizetype,
> >> -                                                        
> >> TYPE_VECTOR_SUBPARTS (
> >> -                                                          vectype));
> >> -                           signed char biasval
> >> -                             = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS 
> >> (loop_vinfo);
> >> -                           bias = build_int_cst (intQI_type_node, 
> >> biasval);
> >> -                           if (!final_mask)
> >> -                             {
> >> -                               mask_vectype = truth_type_for (vectype);
> >> -                               final_mask = build_minus_one_cst 
> >> (mask_vectype);
> >> -                             }
> >> -                         }
> >> -
> >> -                       gcall *call;
> >> -                       if (final_len && final_mask)
> >> -                         call = gimple_build_call_internal (
> >> -                           IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr,
> >> -                           vec_offset, scale, zero, final_mask, final_len,
> >> -                           bias);
> >> -                       else if (final_mask)
> >> -                         call = gimple_build_call_internal
> >> -                           (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
> >> -                            vec_offset, scale, zero, final_mask);
> >> -                       else
> >> -                         call = gimple_build_call_internal
> >> -                           (IFN_GATHER_LOAD, 4, dataref_ptr,
> >> -                            vec_offset, scale, zero);
> >> -                       gimple_call_set_nothrow (call, true);
> >> -                       new_stmt = call;
> >> -                       data_ref = NULL_TREE;
> >> +                       unsigned int cnunits = vect_nunits_for_cost 
> >> (vectype);
> >> +                       inside_cost
> >> +                         = record_stmt_cost (cost_vec, cnunits, 
> >> scalar_load,
> >> +                                             stmt_info, 0, vect_body);
> >>                         break;
> >>                       }
> >> -                   else if (memory_access_type == VMAT_GATHER_SCATTER)
> >> +                   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +                     vec_offset = vec_offsets[vec_num * j + i];
> >> +                   tree zero = build_zero_cst (vectype);
> >> +                   tree scale = size_int (gs_info.scale);
> >> +
> >> +                   if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> >>                       {
> >> -                       /* Emulated gather-scatter.  */
> >> -                       gcc_assert (!final_mask);
> >> -                       unsigned HOST_WIDE_INT const_nunits
> >> -                         = nunits.to_constant ();
> >> -                       if (costing_p)
> >> -                         {
> >> -                           /* For emulated gathers N offset vector element
> >> -                              offset add is consumed by the load).  */
> >> -                           inside_cost
> >> -                             = record_stmt_cost (cost_vec, const_nunits,
> >> -                                                 vec_to_scalar, 
> >> stmt_info, 0,
> >> -                                                 vect_body);
> >> -                           /* N scalar loads plus gathering them into a
> >> -                              vector.  */
> >> -                           inside_cost
> >> -                             = record_stmt_cost (cost_vec, const_nunits,
> >> -                                                 scalar_load, stmt_info, 
> >> 0,
> >> -                                                 vect_body);
> >> -                           inside_cost
> >> -                             = record_stmt_cost (cost_vec, 1, 
> >> vec_construct,
> >> -                                                 stmt_info, 0, vect_body);
> >> -                           break;
> >> -                         }
> >> -                       unsigned HOST_WIDE_INT const_offset_nunits
> >> -                         = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> >> -                             .to_constant ();
> >> -                       vec<constructor_elt, va_gc> *ctor_elts;
> >> -                       vec_alloc (ctor_elts, const_nunits);
> >> -                       gimple_seq stmts = NULL;
> >> -                       /* We support offset vectors with more elements
> >> -                          than the data vector for now.  */
> >> -                       unsigned HOST_WIDE_INT factor
> >> -                         = const_offset_nunits / const_nunits;
> >> -                       vec_offset = vec_offsets[j / factor];
> >> -                       unsigned elt_offset = (j % factor) * const_nunits;
> >> -                       tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> >> -                       tree scale = size_int (gs_info.scale);
> >> -                       align
> >> -                         = get_object_alignment (DR_REF 
> >> (first_dr_info->dr));
> >> -                       tree ltype = build_aligned_type (TREE_TYPE 
> >> (vectype),
> >> -                                                        align);
> >> -                       for (unsigned k = 0; k < const_nunits; ++k)
> >> +                       if (loop_lens)
> >> +                         final_len
> >> +                           = vect_get_loop_len (loop_vinfo, gsi, 
> >> loop_lens,
> >> +                                                vec_num * ncopies, 
> >> vectype,
> >> +                                                vec_num * j + i, 1);
> >> +                       else
> >> +                         final_len
> >> +                           = build_int_cst (sizetype,
> >> +                                            TYPE_VECTOR_SUBPARTS 
> >> (vectype));
> >> +                       signed char biasval
> >> +                         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS 
> >> (loop_vinfo);
> >> +                       bias = build_int_cst (intQI_type_node, biasval);
> >> +                       if (!final_mask)
> >>                           {
> >> -                           tree boff = size_binop (MULT_EXPR,
> >> -                                                   TYPE_SIZE (idx_type),
> >> -                                                   bitsize_int
> >> -                                                     (k + elt_offset));
> >> -                           tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> >> -                                                    idx_type, vec_offset,
> >> -                                                    TYPE_SIZE (idx_type),
> >> -                                                    boff);
> >> -                           idx = gimple_convert (&stmts, sizetype, idx);
> >> -                           idx = gimple_build (&stmts, MULT_EXPR,
> >> -                                               sizetype, idx, scale);
> >> -                           tree ptr = gimple_build (&stmts, PLUS_EXPR,
> >> -                                                    TREE_TYPE 
> >> (dataref_ptr),
> >> -                                                    dataref_ptr, idx);
> >> -                           ptr = gimple_convert (&stmts, ptr_type_node, 
> >> ptr);
> >> -                           tree elt = make_ssa_name (TREE_TYPE (vectype));
> >> -                           tree ref = build2 (MEM_REF, ltype, ptr,
> >> -                                              build_int_cst (ref_type, 
> >> 0));
> >> -                           new_stmt = gimple_build_assign (elt, ref);
> >> -                           gimple_set_vuse (new_stmt,
> >> -                                            gimple_vuse (gsi_stmt 
> >> (*gsi)));
> >> -                           gimple_seq_add_stmt (&stmts, new_stmt);
> >> -                           CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, 
> >> elt);
> >> +                           mask_vectype = truth_type_for (vectype);
> >> +                           final_mask = build_minus_one_cst 
> >> (mask_vectype);
> >>                           }
> >> -                       gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> >> -                       new_stmt = gimple_build_assign (NULL_TREE,
> >> -                                                       build_constructor
> >> -                                                         (vectype, 
> >> ctor_elts));
> >> -                       data_ref = NULL_TREE;
> >> -                       break;
> >>                       }
> >>
> >> -                   if (costing_p)
> >> -                     break;
> >> -
> >> -                   align =
> >> -                     known_alignment (DR_TARGET_ALIGNMENT 
> >> (first_dr_info));
> >> -                   if (alignment_support_scheme == dr_aligned)
> >> -                     misalign = 0;
> >> -                   else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
> >> -                     {
> >> -                       align = dr_alignment
> >> -                         (vect_dr_behavior (vinfo, first_dr_info));
> >> -                       misalign = 0;
> >> -                     }
> >> +                   gcall *call;
> >> +                   if (final_len && final_mask)
> >> +                     call = gimple_build_call_internal (
> >> +                       IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, 
> >> vec_offset,
> >> +                       scale, zero, final_mask, final_len, bias);
> >> +                   else if (final_mask)
> >> +                     call
> >> +                       = gimple_build_call_internal 
> >> (IFN_MASK_GATHER_LOAD, 5,
> >> +                                                     dataref_ptr, 
> >> vec_offset,
> >> +                                                     scale, zero, 
> >> final_mask);
> >>                     else
> >> -                     misalign = misalignment;
> >> -                   if (dataref_offset == NULL_TREE
> >> -                       && TREE_CODE (dataref_ptr) == SSA_NAME)
> >> -                     set_ptr_info_alignment (get_ptr_info (dataref_ptr),
> >> -                                             align, misalign);
> >> -                   align = least_bit_hwi (misalign | align);
> >> -
> >> -                   /* Compute IFN when LOOP_LENS or final_mask valid.  */
> >> -                   machine_mode vmode = TYPE_MODE (vectype);
> >> -                   machine_mode new_vmode = vmode;
> >> -                   internal_fn partial_ifn = IFN_LAST;
> >> -                   if (loop_lens)
> >> +                     call
> >> +                       = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
> >> +                                                     dataref_ptr, 
> >> vec_offset,
> >> +                                                     scale, zero);
> >> +                   gimple_call_set_nothrow (call, true);
> >> +                   new_stmt = call;
> >> +                   data_ref = NULL_TREE;
> >> +                   break;
> >> +                 }
> >> +               else if (memory_access_type == VMAT_GATHER_SCATTER)
> >> +                 {
> >> +                   /* Emulated gather-scatter.  */
> >> +                   gcc_assert (!final_mask);
> >> +                   unsigned HOST_WIDE_INT const_nunits = 
> >> nunits.to_constant ();
> >> +                   if (costing_p)
> >>                       {
> >> -                       opt_machine_mode new_ovmode
> >> -                         = get_len_load_store_mode (vmode, true,
> >> -                                                    &partial_ifn);
> >> -                       new_vmode = new_ovmode.require ();
> >> -                       unsigned factor = (new_ovmode == vmode)
> >> -                                           ? 1
> >> -                                           : GET_MODE_UNIT_SIZE (vmode);
> >> -                       final_len
> >> -                         = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> >> -                                              vec_num * ncopies, vectype,
> >> -                                              vec_num * j + i, factor);
> >> +                       /* For emulated gathers N offset vector element
> >> +                          offset add is consumed by the load).  */
> >> +                       inside_cost
> >> +                         = record_stmt_cost (cost_vec, const_nunits,
> >> +                                             vec_to_scalar, stmt_info, 0,
> >> +                                             vect_body);
> >> +                       /* N scalar loads plus gathering them into a
> >> +                          vector.  */
> >> +                       inside_cost = record_stmt_cost (cost_vec, 
> >> const_nunits,
> >> +                                                       scalar_load, 
> >> stmt_info,
> >> +                                                       0, vect_body);
> >> +                       inside_cost
> >> +                         = record_stmt_cost (cost_vec, 1, vec_construct,
> >> +                                             stmt_info, 0, vect_body);
> >> +                       break;
> >>                       }
> >> -                   else if (final_mask)
> >> +                   unsigned HOST_WIDE_INT const_offset_nunits
> >> +                     = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> >> +                         .to_constant ();
> >> +                   vec<constructor_elt, va_gc> *ctor_elts;
> >> +                   vec_alloc (ctor_elts, const_nunits);
> >> +                   gimple_seq stmts = NULL;
> >> +                   /* We support offset vectors with more elements
> >> +                      than the data vector for now.  */
> >> +                   unsigned HOST_WIDE_INT factor
> >> +                     = const_offset_nunits / const_nunits;
> >> +                   vec_offset = vec_offsets[j / factor];
> >> +                   unsigned elt_offset = (j % factor) * const_nunits;
> >> +                   tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> >> +                   tree scale = size_int (gs_info.scale);
> >> +                   align = get_object_alignment (DR_REF 
> >> (first_dr_info->dr));
> >> +                   tree ltype
> >> +                     = build_aligned_type (TREE_TYPE (vectype), align);
> >> +                   for (unsigned k = 0; k < const_nunits; ++k)
> >>                       {
> >> -                       if (!can_vec_mask_load_store_p (
> >> -                             vmode, TYPE_MODE (TREE_TYPE (final_mask)), 
> >> true,
> >> -                             &partial_ifn))
> >> -                         gcc_unreachable ();
> >> +                       tree boff = size_binop (MULT_EXPR, TYPE_SIZE 
> >> (idx_type),
> >> +                                               bitsize_int (k + 
> >> elt_offset));
> >> +                       tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> >> +                                                idx_type, vec_offset,
> >> +                                                TYPE_SIZE (idx_type), 
> >> boff);
> >> +                       idx = gimple_convert (&stmts, sizetype, idx);
> >> +                       idx = gimple_build (&stmts, MULT_EXPR, sizetype, 
> >> idx,
> >> +                                           scale);
> >> +                       tree ptr = gimple_build (&stmts, PLUS_EXPR,
> >> +                                                TREE_TYPE (dataref_ptr),
> >> +                                                dataref_ptr, idx);
> >> +                       ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> >> +                       tree elt = make_ssa_name (TREE_TYPE (vectype));
> >> +                       tree ref = build2 (MEM_REF, ltype, ptr,
> >> +                                          build_int_cst (ref_type, 0));
> >> +                       new_stmt = gimple_build_assign (elt, ref);
> >> +                       gimple_set_vuse (new_stmt,
> >> +                                        gimple_vuse (gsi_stmt (*gsi)));
> >> +                       gimple_seq_add_stmt (&stmts, new_stmt);
> >> +                       CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
> >>                       }
> >> +                   gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> >> +                   new_stmt = gimple_build_assign (
> >> +                     NULL_TREE, build_constructor (vectype, ctor_elts));
> >> +                   data_ref = NULL_TREE;
> >> +                   break;
> >> +                 }
> >>
> >> -                   if (partial_ifn == IFN_MASK_LEN_LOAD)
> >> +               if (costing_p)
> >> +                 break;
> >> +
> >> +               align = known_alignment (DR_TARGET_ALIGNMENT 
> >> (first_dr_info));
> >> +               if (alignment_support_scheme == dr_aligned)
> >> +                 misalign = 0;
> >> +               else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
> >> +                 {
> >> +                   align
> >> +                     = dr_alignment (vect_dr_behavior (vinfo, 
> >> first_dr_info));
> >> +                   misalign = 0;
> >> +                 }
> >> +               else
> >> +                 misalign = misalignment;
> >> +               if (dataref_offset == NULL_TREE
> >> +                   && TREE_CODE (dataref_ptr) == SSA_NAME)
> >> +                 set_ptr_info_alignment (get_ptr_info (dataref_ptr), 
> >> align,
> >> +                                         misalign);
> >> +               align = least_bit_hwi (misalign | align);
> >> +
> >> +               /* Compute IFN when LOOP_LENS or final_mask valid.  */
> >> +               machine_mode vmode = TYPE_MODE (vectype);
> >> +               machine_mode new_vmode = vmode;
> >> +               internal_fn partial_ifn = IFN_LAST;
> >> +               if (loop_lens)
> >> +                 {
> >> +                   opt_machine_mode new_ovmode
> >> +                     = get_len_load_store_mode (vmode, true, 
> >> &partial_ifn);
> >> +                   new_vmode = new_ovmode.require ();
> >> +                   unsigned factor
> >> +                     = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE 
> >> (vmode);
> >> +                   final_len = vect_get_loop_len (loop_vinfo, gsi, 
> >> loop_lens,
> >> +                                                  vec_num * ncopies, 
> >> vectype,
> >> +                                                  vec_num * j + i, 
> >> factor);
> >> +                 }
> >> +               else if (final_mask)
> >> +                 {
> >> +                   if (!can_vec_mask_load_store_p (
> >> +                         vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
> >> +                         &partial_ifn))
> >> +                     gcc_unreachable ();
> >> +                 }
> >> +
> >> +               if (partial_ifn == IFN_MASK_LEN_LOAD)
> >> +                 {
> >> +                   if (!final_len)
> >>                       {
> >> -                       if (!final_len)
> >> -                         {
> >> -                           /* Pass VF value to 'len' argument of
> >> -                              MASK_LEN_LOAD if LOOP_LENS is invalid.  */
> >> -                           final_len
> >> -                             = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> >> -                         }
> >> -                       if (!final_mask)
> >> -                         {
> >> -                           /* Pass all ones value to 'mask' argument of
> >> -                              MASK_LEN_LOAD if final_mask is invalid.  */
> >> -                           mask_vectype = truth_type_for (vectype);
> >> -                           final_mask = build_minus_one_cst 
> >> (mask_vectype);
> >> -                         }
> >> +                       /* Pass VF value to 'len' argument of
> >> +                          MASK_LEN_LOAD if LOOP_LENS is invalid.  */
> >> +                       final_len = size_int (TYPE_VECTOR_SUBPARTS 
> >> (vectype));
> >>                       }
> >> -                   if (final_len)
> >> +                   if (!final_mask)
> >>                       {
> >> -                       signed char biasval
> >> -                         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS 
> >> (loop_vinfo);
> >> -
> >> -                       bias = build_int_cst (intQI_type_node, biasval);
> >> +                       /* Pass all ones value to 'mask' argument of
> >> +                          MASK_LEN_LOAD if final_mask is invalid.  */
> >> +                       mask_vectype = truth_type_for (vectype);
> >> +                       final_mask = build_minus_one_cst (mask_vectype);
> >>                       }
> >> +                 }
> >> +               if (final_len)
> >> +                 {
> >> +                   signed char biasval
> >> +                     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> >>
> >> -                   if (final_len)
> >> +                   bias = build_int_cst (intQI_type_node, biasval);
> >> +                 }
> >> +
> >> +               if (final_len)
> >> +                 {
> >> +                   tree ptr = build_int_cst (ref_type, align * 
> >> BITS_PER_UNIT);
> >> +                   gcall *call;
> >> +                   if (partial_ifn == IFN_MASK_LEN_LOAD)
> >> +                     call = gimple_build_call_internal 
> >> (IFN_MASK_LEN_LOAD, 5,
> >> +                                                        dataref_ptr, ptr,
> >> +                                                        final_mask, 
> >> final_len,
> >> +                                                        bias);
> >> +                   else
> >> +                     call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
> >> +                                                        dataref_ptr, ptr,
> >> +                                                        final_len, bias);
> >> +                   gimple_call_set_nothrow (call, true);
> >> +                   new_stmt = call;
> >> +                   data_ref = NULL_TREE;
> >> +
> >> +                   /* Need conversion if it's wrapped with VnQI.  */
> >> +                   if (vmode != new_vmode)
> >>                       {
> >> -                       tree ptr
> >> -                         = build_int_cst (ref_type, align * 
> >> BITS_PER_UNIT);
> >> -                       gcall *call;
> >> -                       if (partial_ifn == IFN_MASK_LEN_LOAD)
> >> -                         call = gimple_build_call_internal 
> >> (IFN_MASK_LEN_LOAD,
> >> -                                                            5, 
> >> dataref_ptr,
> >> -                                                            ptr, 
> >> final_mask,
> >> -                                                            final_len, 
> >> bias);
> >> -                       else
> >> -                         call = gimple_build_call_internal (IFN_LEN_LOAD, 
> >> 4,
> >> -                                                            dataref_ptr, 
> >> ptr,
> >> -                                                            final_len, 
> >> bias);
> >> -                       gimple_call_set_nothrow (call, true);
> >> -                       new_stmt = call;
> >> -                       data_ref = NULL_TREE;
> >> -
> >> -                       /* Need conversion if it's wrapped with VnQI.  */
> >> -                       if (vmode != new_vmode)
> >> -                         {
> >> -                           tree new_vtype = build_vector_type_for_mode (
> >> -                             unsigned_intQI_type_node, new_vmode);
> >> -                           tree var = vect_get_new_ssa_name (new_vtype,
> >> -                                                             
> >> vect_simple_var);
> >> -                           gimple_set_lhs (call, var);
> >> -                           vect_finish_stmt_generation (vinfo, stmt_info, 
> >> call,
> >> -                                                        gsi);
> >> -                           tree op = build1 (VIEW_CONVERT_EXPR, vectype, 
> >> var);
> >> -                           new_stmt
> >> -                             = gimple_build_assign (vec_dest,
> >> -                                                    VIEW_CONVERT_EXPR, 
> >> op);
> >> -                         }
> >> +                       tree new_vtype = build_vector_type_for_mode (
> >> +                         unsigned_intQI_type_node, new_vmode);
> >> +                       tree var
> >> +                         = vect_get_new_ssa_name (new_vtype, 
> >> vect_simple_var);
> >> +                       gimple_set_lhs (call, var);
> >> +                       vect_finish_stmt_generation (vinfo, stmt_info, 
> >> call,
> >> +                                                    gsi);
> >> +                       tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
> >> +                       new_stmt = gimple_build_assign (vec_dest,
> >> +                                                       VIEW_CONVERT_EXPR, 
> >> op);
> >>                       }
> >> -                   else if (final_mask)
> >> +                 }
> >> +               else if (final_mask)
> >> +                 {
> >> +                   tree ptr = build_int_cst (ref_type, align * 
> >> BITS_PER_UNIT);
> >> +                   gcall *call = gimple_build_call_internal 
> >> (IFN_MASK_LOAD, 3,
> >> +                                                             dataref_ptr, 
> >> ptr,
> >> +                                                             final_mask);
> >> +                   gimple_call_set_nothrow (call, true);
> >> +                   new_stmt = call;
> >> +                   data_ref = NULL_TREE;
> >> +                 }
> >> +               else
> >> +                 {
> >> +                   tree ltype = vectype;
> >> +                   tree new_vtype = NULL_TREE;
> >> +                   unsigned HOST_WIDE_INT gap = DR_GROUP_GAP 
> >> (first_stmt_info);
> >> +                   unsigned int vect_align
> >> +                     = vect_known_alignment_in_bytes (first_dr_info, 
> >> vectype);
> >> +                   unsigned int scalar_dr_size
> >> +                     = vect_get_scalar_dr_size (first_dr_info);
> >> +                   /* If there's no peeling for gaps but we have a gap
> >> +                      with slp loads then load the lower half of the
> >> +                      vector only.  See get_group_load_store_type for
> >> +                      when we apply this optimization.  */
> >> +                   if (slp
> >> +                       && loop_vinfo
> >> +                       && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && 
> >> gap != 0
> >> +                       && known_eq (nunits, (group_size - gap) * 2)
> >> +                       && known_eq (nunits, group_size)
> >> +                       && gap >= (vect_align / scalar_dr_size))
> >>                       {
> >> -                       tree ptr = build_int_cst (ref_type,
> >> -                                                 align * BITS_PER_UNIT);
> >> -                       gcall *call
> >> -                         = gimple_build_call_internal (IFN_MASK_LOAD, 3,
> >> -                                                       dataref_ptr, ptr,
> >> -                                                       final_mask);
> >> -                       gimple_call_set_nothrow (call, true);
> >> -                       new_stmt = call;
> >> -                       data_ref = NULL_TREE;
> >> +                       tree half_vtype;
> >> +                       new_vtype
> >> +                         = vector_vector_composition_type (vectype, 2,
> >> +                                                           &half_vtype);
> >> +                       if (new_vtype != NULL_TREE)
> >> +                         ltype = half_vtype;
> >>                       }
> >> +                   tree offset
> >> +                     = (dataref_offset ? dataref_offset
> >> +                                       : build_int_cst (ref_type, 0));
> >> +                   if (ltype != vectype
> >> +                       && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >> +                     {
> >> +                       unsigned HOST_WIDE_INT gap_offset
> >> +                         = gap * tree_to_uhwi (TYPE_SIZE_UNIT 
> >> (elem_type));
> >> +                       tree gapcst = build_int_cst (ref_type, gap_offset);
> >> +                       offset = size_binop (PLUS_EXPR, offset, gapcst);
> >> +                     }
> >> +                   data_ref
> >> +                     = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
> >> +                   if (alignment_support_scheme == dr_aligned)
> >> +                     ;
> >>                     else
> >> +                     TREE_TYPE (data_ref)
> >> +                       = build_aligned_type (TREE_TYPE (data_ref),
> >> +                                             align * BITS_PER_UNIT);
> >> +                   if (ltype != vectype)
> >>                       {
> >> -                       tree ltype = vectype;
> >> -                       tree new_vtype = NULL_TREE;
> >> -                       unsigned HOST_WIDE_INT gap
> >> -                         = DR_GROUP_GAP (first_stmt_info);
> >> -                       unsigned int vect_align
> >> -                         = vect_known_alignment_in_bytes (first_dr_info,
> >> -                                                          vectype);
> >> -                       unsigned int scalar_dr_size
> >> -                         = vect_get_scalar_dr_size (first_dr_info);
> >> -                       /* If there's no peeling for gaps but we have a gap
> >> -                          with slp loads then load the lower half of the
> >> -                          vector only.  See get_group_load_store_type for
> >> -                          when we apply this optimization.  */
> >> -                       if (slp
> >> -                           && loop_vinfo
> >> -                           && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> >> -                           && gap != 0
> >> -                           && known_eq (nunits, (group_size - gap) * 2)
> >> -                           && known_eq (nunits, group_size)
> >> -                           && gap >= (vect_align / scalar_dr_size))
> >> +                       vect_copy_ref_info (data_ref,
> >> +                                           DR_REF (first_dr_info->dr));
> >> +                       tree tem = make_ssa_name (ltype);
> >> +                       new_stmt = gimple_build_assign (tem, data_ref);
> >> +                       vect_finish_stmt_generation (vinfo, stmt_info, 
> >> new_stmt,
> >> +                                                    gsi);
> >> +                       data_ref = NULL;
> >> +                       vec<constructor_elt, va_gc> *v;
> >> +                       vec_alloc (v, 2);
> >> +                       if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >>                           {
> >> -                           tree half_vtype;
> >> -                           new_vtype
> >> -                             = vector_vector_composition_type (vectype, 2,
> >> -                                                               
> >> &half_vtype);
> >> -                           if (new_vtype != NULL_TREE)
> >> -                             ltype = half_vtype;
> >> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> >> +                                                   build_zero_cst 
> >> (ltype));
> >> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> >>                           }
> >> -                       tree offset
> >> -                         = (dataref_offset ? dataref_offset
> >> -                                           : build_int_cst (ref_type, 0));
> >> -                       if (ltype != vectype
> >> -                           && memory_access_type == 
> >> VMAT_CONTIGUOUS_REVERSE)
> >> +                       else
> >>                           {
> >> -                           unsigned HOST_WIDE_INT gap_offset
> >> -                             = gap * tree_to_uhwi (TYPE_SIZE_UNIT 
> >> (elem_type));
> >> -                           tree gapcst = build_int_cst (ref_type, 
> >> gap_offset);
> >> -                           offset = size_binop (PLUS_EXPR, offset, 
> >> gapcst);
> >> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> >> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> >> +                                                   build_zero_cst 
> >> (ltype));
> >>                           }
> >> -                       data_ref
> >> -                         = fold_build2 (MEM_REF, ltype, dataref_ptr, 
> >> offset);
> >> -                       if (alignment_support_scheme == dr_aligned)
> >> -                         ;
> >> +                       gcc_assert (new_vtype != NULL_TREE);
> >> +                       if (new_vtype == vectype)
> >> +                         new_stmt = gimple_build_assign (
> >> +                           vec_dest, build_constructor (vectype, v));
> >>                         else
> >> -                         TREE_TYPE (data_ref)
> >> -                           = build_aligned_type (TREE_TYPE (data_ref),
> >> -                                                 align * BITS_PER_UNIT);
> >> -                       if (ltype != vectype)
> >>                           {
> >> -                           vect_copy_ref_info (data_ref,
> >> -                                               DR_REF 
> >> (first_dr_info->dr));
> >> -                           tree tem = make_ssa_name (ltype);
> >> -                           new_stmt = gimple_build_assign (tem, data_ref);
> >> +                           tree new_vname = make_ssa_name (new_vtype);
> >> +                           new_stmt = gimple_build_assign (
> >> +                             new_vname, build_constructor (new_vtype, v));
> >>                             vect_finish_stmt_generation (vinfo, stmt_info,
> >>                                                          new_stmt, gsi);
> >> -                           data_ref = NULL;
> >> -                           vec<constructor_elt, va_gc> *v;
> >> -                           vec_alloc (v, 2);
> >> -                           if (memory_access_type == 
> >> VMAT_CONTIGUOUS_REVERSE)
> >> -                             {
> >> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> >> -                                                       build_zero_cst 
> >> (ltype));
> >> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> >> -                             }
> >> -                           else
> >> -                             {
> >> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> >> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> >> -                                                       build_zero_cst 
> >> (ltype));
> >> -                             }
> >> -                           gcc_assert (new_vtype != NULL_TREE);
> >> -                           if (new_vtype == vectype)
> >> -                             new_stmt = gimple_build_assign (
> >> -                               vec_dest, build_constructor (vectype, v));
> >> -                           else
> >> -                             {
> >> -                               tree new_vname = make_ssa_name (new_vtype);
> >> -                               new_stmt = gimple_build_assign (
> >> -                                 new_vname, build_constructor (new_vtype, 
> >> v));
> >> -                               vect_finish_stmt_generation (vinfo, 
> >> stmt_info,
> >> -                                                            new_stmt, 
> >> gsi);
> >> -                               new_stmt = gimple_build_assign (
> >> -                                 vec_dest, build1 (VIEW_CONVERT_EXPR, 
> >> vectype,
> >> -                                                   new_vname));
> >> -                             }
> >> +                           new_stmt = gimple_build_assign (
> >> +                             vec_dest,
> >> +                             build1 (VIEW_CONVERT_EXPR, vectype, 
> >> new_vname));
> >>                           }
> >>                       }
> >> -                   break;
> >>                   }
> >> -               case dr_explicit_realign:
> >> -                 {
> >> -                   if (costing_p)
> >> -                     break;
> >> -                   tree ptr, bump;
> >> -
> >> -                   tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> >> +               break;
> >> +             }
> >> +           case dr_explicit_realign:
> >> +             {
> >> +               if (costing_p)
> >> +                 break;
> >> +               tree ptr, bump;
> >>
> >> -                   if (compute_in_loop)
> >> -                     msq = vect_setup_realignment (vinfo, 
> >> first_stmt_info, gsi,
> >> -                                                   &realignment_token,
> >> -                                                   dr_explicit_realign,
> >> -                                                   dataref_ptr, NULL);
> >> +               tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> >>
> >> -                   if (TREE_CODE (dataref_ptr) == SSA_NAME)
> >> -                     ptr = copy_ssa_name (dataref_ptr);
> >> -                   else
> >> -                     ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
> >> -                   // For explicit realign the target alignment should be
> >> -                   // known at compile time.
> >> -                   unsigned HOST_WIDE_INT align =
> >> -                     DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> >> -                   new_stmt = gimple_build_assign
> >> -                                (ptr, BIT_AND_EXPR, dataref_ptr,
> >> -                                 build_int_cst
> >> -                                 (TREE_TYPE (dataref_ptr),
> >> -                                  -(HOST_WIDE_INT) align));
> >> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> >> -                                                new_stmt, gsi);
> >> -                   data_ref
> >> -                     = build2 (MEM_REF, vectype, ptr,
> >> -                               build_int_cst (ref_type, 0));
> >> -                   vect_copy_ref_info (data_ref, DR_REF 
> >> (first_dr_info->dr));
> >> -                   vec_dest = vect_create_destination_var (scalar_dest,
> >> -                                                           vectype);
> >> -                   new_stmt = gimple_build_assign (vec_dest, data_ref);
> >> -                   new_temp = make_ssa_name (vec_dest, new_stmt);
> >> -                   gimple_assign_set_lhs (new_stmt, new_temp);
> >> -                   gimple_move_vops (new_stmt, stmt_info->stmt);
> >> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> >> -                                                new_stmt, gsi);
> >> -                   msq = new_temp;
> >> -
> >> -                   bump = size_binop (MULT_EXPR, vs,
> >> -                                      TYPE_SIZE_UNIT (elem_type));
> >> -                   bump = size_binop (MINUS_EXPR, bump, size_one_node);
> >> -                   ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi,
> >> -                                          stmt_info, bump);
> >> -                   new_stmt = gimple_build_assign
> >> -                                (NULL_TREE, BIT_AND_EXPR, ptr,
> >> -                                 build_int_cst
> >> -                                 (TREE_TYPE (ptr), -(HOST_WIDE_INT) 
> >> align));
> >> -                   if (TREE_CODE (ptr) == SSA_NAME)
> >> -                     ptr = copy_ssa_name (ptr, new_stmt);
> >> -                   else
> >> -                     ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
> >> -                   gimple_assign_set_lhs (new_stmt, ptr);
> >> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> >> -                                                new_stmt, gsi);
> >> -                   data_ref
> >> -                     = build2 (MEM_REF, vectype, ptr,
> >> -                               build_int_cst (ref_type, 0));
> >> -                   break;
> >> -                 }
> >> -               case dr_explicit_realign_optimized:
> >> -                 {
> >> -                   if (costing_p)
> >> -                     break;
> >> -                   if (TREE_CODE (dataref_ptr) == SSA_NAME)
> >> -                     new_temp = copy_ssa_name (dataref_ptr);
> >> -                   else
> >> -                     new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
> >> -                   // We should only be doing this if we know the target
> >> -                   // alignment at compile time.
> >> -                   unsigned HOST_WIDE_INT align =
> >> -                     DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> >> -                   new_stmt = gimple_build_assign
> >> -                     (new_temp, BIT_AND_EXPR, dataref_ptr,
> >> -                      build_int_cst (TREE_TYPE (dataref_ptr),
> >> -                                    -(HOST_WIDE_INT) align));
> >> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> >> -                                                new_stmt, gsi);
> >> -                   data_ref
> >> -                     = build2 (MEM_REF, vectype, new_temp,
> >> -                               build_int_cst (ref_type, 0));
> >> -                   break;
> >> -                 }
> >> -               default:
> >> -                 gcc_unreachable ();
> >> -               }
> >> +               if (compute_in_loop)
> >> +                 msq = vect_setup_realignment (vinfo, first_stmt_info, 
> >> gsi,
> >> +                                               &realignment_token,
> >> +                                               dr_explicit_realign,
> >> +                                               dataref_ptr, NULL);
> >> +
> >> +               if (TREE_CODE (dataref_ptr) == SSA_NAME)
> >> +                 ptr = copy_ssa_name (dataref_ptr);
> >> +               else
> >> +                 ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
> >> +               // For explicit realign the target alignment should be
> >> +               // known at compile time.
> >> +               unsigned HOST_WIDE_INT align
> >> +                 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> >> +               new_stmt = gimple_build_assign (
> >> +                 ptr, BIT_AND_EXPR, dataref_ptr,
> >> +                 build_int_cst (TREE_TYPE (dataref_ptr),
> >> +                                -(HOST_WIDE_INT) align));
> >> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, 
> >> gsi);
> >> +               data_ref
> >> +                 = build2 (MEM_REF, vectype, ptr, build_int_cst 
> >> (ref_type, 0));
> >> +               vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> >> +               vec_dest = vect_create_destination_var (scalar_dest, 
> >> vectype);
> >> +               new_stmt = gimple_build_assign (vec_dest, data_ref);
> >> +               new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +               gimple_assign_set_lhs (new_stmt, new_temp);
> >> +               gimple_move_vops (new_stmt, stmt_info->stmt);
> >> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, 
> >> gsi);
> >> +               msq = new_temp;
> >> +
> >> +               bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT 
> >> (elem_type));
> >> +               bump = size_binop (MINUS_EXPR, bump, size_one_node);
> >> +               ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, 
> >> stmt_info,
> >> +                                      bump);
> >> +               new_stmt = gimple_build_assign (
> >> +                 NULL_TREE, BIT_AND_EXPR, ptr,
> >> +                 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
> >> +               if (TREE_CODE (ptr) == SSA_NAME)
> >> +                 ptr = copy_ssa_name (ptr, new_stmt);
> >> +               else
> >> +                 ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
> >> +               gimple_assign_set_lhs (new_stmt, ptr);
> >> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, 
> >> gsi);
> >> +               data_ref
> >> +                 = build2 (MEM_REF, vectype, ptr, build_int_cst 
> >> (ref_type, 0));
> >> +               break;
> >> +             }
> >> +           case dr_explicit_realign_optimized:
> >> +             {
> >> +               if (costing_p)
> >> +                 break;
> >> +               if (TREE_CODE (dataref_ptr) == SSA_NAME)
> >> +                 new_temp = copy_ssa_name (dataref_ptr);
> >> +               else
> >> +                 new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
> >> +               // We should only be doing this if we know the target
> >> +               // alignment at compile time.
> >> +               unsigned HOST_WIDE_INT align
> >> +                 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> >> +               new_stmt = gimple_build_assign (
> >> +                 new_temp, BIT_AND_EXPR, dataref_ptr,
> >> +                 build_int_cst (TREE_TYPE (dataref_ptr),
> >> +                                -(HOST_WIDE_INT) align));
> >> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, 
> >> gsi);
> >> +               data_ref = build2 (MEM_REF, vectype, new_temp,
> >> +                                  build_int_cst (ref_type, 0));
> >> +               break;
> >> +             }
> >> +           default:
> >> +             gcc_unreachable ();
> >> +           }
> >>
> >> -             /* One common place to cost the above vect load for different
> >> -                alignment support schemes.  */
> >> -             if (costing_p)
> >> -               {
> >> -                 /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
> >> -                    only need to take care of the first stmt, whose
> >> -                    stmt_info is first_stmt_info, vec_num iterating on it
> >> -                    will cover the cost for the remaining, it's consistent
> >> -                    with transforming.  For the prologue cost for realign,
> >> -                    we only need to count it once for the whole group.  */
> >> -                 bool first_stmt_info_p = first_stmt_info == stmt_info;
> >> -                 bool add_realign_cost = first_stmt_info_p && i == 0;
> >> -                 if (memory_access_type == VMAT_CONTIGUOUS
> >> -                     || memory_access_type == VMAT_CONTIGUOUS_REVERSE
> >> -                     || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
> >> -                         && (!grouped_load || first_stmt_info_p)))
> >> -                   vect_get_load_cost (vinfo, stmt_info, 1,
> >> -                                       alignment_support_scheme, 
> >> misalignment,
> >> -                                       add_realign_cost, &inside_cost,
> >> -                                       &prologue_cost, cost_vec, cost_vec,
> >> -                                       true);
> >> -               }
> >> -             else
> >> +         /* One common place to cost the above vect load for different
> >> +            alignment support schemes.  */
> >> +         if (costing_p)
> >> +           {
> >> +             /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
> >> +                only need to take care of the first stmt, whose
> >> +                stmt_info is first_stmt_info, vec_num iterating on it
> >> +                will cover the cost for the remaining, it's consistent
> >> +                with transforming.  For the prologue cost for realign,
> >> +                we only need to count it once for the whole group.  */
> >> +             bool first_stmt_info_p = first_stmt_info == stmt_info;
> >> +             bool add_realign_cost = first_stmt_info_p && i == 0;
> >> +             if (memory_access_type == VMAT_CONTIGUOUS
> >> +                 || memory_access_type == VMAT_CONTIGUOUS_REVERSE
> >> +                 || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
> >> +                     && (!grouped_load || first_stmt_info_p)))
> >> +               vect_get_load_cost (vinfo, stmt_info, 1,
> >> +                                   alignment_support_scheme, misalignment,
> >> +                                   add_realign_cost, &inside_cost,
> >> +                                   &prologue_cost, cost_vec, cost_vec, 
> >> true);
> >> +           }
> >> +         else
> >> +           {
> >> +             vec_dest = vect_create_destination_var (scalar_dest, 
> >> vectype);
> >> +             /* DATA_REF is null if we've already built the statement.  */
> >> +             if (data_ref)
> >>                 {
> >> -                 vec_dest = vect_create_destination_var (scalar_dest, 
> >> vectype);
> >> -                 /* DATA_REF is null if we've already built the 
> >> statement.  */
> >> -                 if (data_ref)
> >> -                   {
> >> -                     vect_copy_ref_info (data_ref, DR_REF 
> >> (first_dr_info->dr));
> >> -                     new_stmt = gimple_build_assign (vec_dest, data_ref);
> >> -                   }
> >> -                 new_temp = make_ssa_name (vec_dest, new_stmt);
> >> -                 gimple_set_lhs (new_stmt, new_temp);
> >> -                 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, 
> >> gsi);
> >> +                 vect_copy_ref_info (data_ref, DR_REF 
> >> (first_dr_info->dr));
> >> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
> >>                 }
> >> +             new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +             gimple_set_lhs (new_stmt, new_temp);
> >> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, 
> >> gsi);
> >> +           }
> >>
> >> -             /* 3. Handle explicit realignment if necessary/supported.
> >> -                Create in loop:
> >> -                  vec_dest = realign_load (msq, lsq, realignment_token)  
> >> */
> >> -             if (!costing_p
> >> -                 && (alignment_support_scheme == 
> >> dr_explicit_realign_optimized
> >> -                     || alignment_support_scheme == dr_explicit_realign))
> >> -               {
> >> -                 lsq = gimple_assign_lhs (new_stmt);
> >> -                 if (!realignment_token)
> >> -                   realignment_token = dataref_ptr;
> >> -                 vec_dest = vect_create_destination_var (scalar_dest, 
> >> vectype);
> >> -                 new_stmt = gimple_build_assign (vec_dest, 
> >> REALIGN_LOAD_EXPR,
> >> -                                                 msq, lsq, 
> >> realignment_token);
> >> -                 new_temp = make_ssa_name (vec_dest, new_stmt);
> >> -                 gimple_assign_set_lhs (new_stmt, new_temp);
> >> -                 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, 
> >> gsi);
> >> +         /* 3. Handle explicit realignment if necessary/supported.
> >> +            Create in loop:
> >> +              vec_dest = realign_load (msq, lsq, realignment_token)  */
> >> +         if (!costing_p
> >> +             && (alignment_support_scheme == dr_explicit_realign_optimized
> >> +                 || alignment_support_scheme == dr_explicit_realign))
> >> +           {
> >> +             lsq = gimple_assign_lhs (new_stmt);
> >> +             if (!realignment_token)
> >> +               realignment_token = dataref_ptr;
> >> +             vec_dest = vect_create_destination_var (scalar_dest, 
> >> vectype);
> >> +             new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, 
> >> msq,
> >> +                                             lsq, realignment_token);
> >> +             new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +             gimple_assign_set_lhs (new_stmt, new_temp);
> >> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, 
> >> gsi);
> >>
> >> -                 if (alignment_support_scheme == 
> >> dr_explicit_realign_optimized)
> >> -                   {
> >> -                     gcc_assert (phi);
> >> -                     if (i == vec_num - 1 && j == ncopies - 1)
> >> -                       add_phi_arg (phi, lsq,
> >> -                                    loop_latch_edge (containing_loop),
> >> -                                    UNKNOWN_LOCATION);
> >> -                     msq = lsq;
> >> -                   }
> >> +             if (alignment_support_scheme == 
> >> dr_explicit_realign_optimized)
> >> +               {
> >> +                 gcc_assert (phi);
> >> +                 if (i == vec_num - 1 && j == ncopies - 1)
> >> +                   add_phi_arg (phi, lsq, loop_latch_edge 
> >> (containing_loop),
> >> +                                UNKNOWN_LOCATION);
> >> +                 msq = lsq;
> >>                 }
> >> +           }
> >>
> >> -             if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >> +         if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >> +           {
> >> +             if (costing_p)
> >> +               inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
> >> +                                               stmt_info, 0, vect_body);
> >> +             else
> >>                 {
> >> -                 if (costing_p)
> >> -                   inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
> >> -                                                   stmt_info, 0, 
> >> vect_body);
> >> -                 else
> >> -                   {
> >> -                     tree perm_mask = perm_mask_for_reverse (vectype);
> >> -                     new_temp
> >> -                       = permute_vec_elements (vinfo, new_temp, new_temp,
> >> -                                               perm_mask, stmt_info, gsi);
> >> -                     new_stmt = SSA_NAME_DEF_STMT (new_temp);
> >> -                   }
> >> +                 tree perm_mask = perm_mask_for_reverse (vectype);
> >> +                 new_temp = permute_vec_elements (vinfo, new_temp, 
> >> new_temp,
> >> +                                                  perm_mask, stmt_info, 
> >> gsi);
> >> +                 new_stmt = SSA_NAME_DEF_STMT (new_temp);
> >>                 }
> >> +           }
> >>
> >> -             /* Collect vector loads and later create their permutation in
> >> -                vect_transform_grouped_load ().  */
> >> -             if (!costing_p && (grouped_load || slp_perm))
> >> -               dr_chain.quick_push (new_temp);
> >> +         /* Collect vector loads and later create their permutation in
> >> +            vect_transform_grouped_load ().  */
> >> +         if (!costing_p && (grouped_load || slp_perm))
> >> +           dr_chain.quick_push (new_temp);
> >>
> >> -             /* Store vector loads in the corresponding SLP_NODE.  */
> >> -             if (!costing_p && slp && !slp_perm)
> >> -               slp_node->push_vec_def (new_stmt);
> >> +         /* Store vector loads in the corresponding SLP_NODE.  */
> >> +         if (!costing_p && slp && !slp_perm)
> >> +           slp_node->push_vec_def (new_stmt);
> >>
> >> -             /* With SLP permutation we load the gaps as well, without
> >> -                we need to skip the gaps after we manage to fully load
> >> -                all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
> >> -             group_elt += nunits;
> >> -             if (!costing_p
> >> -                 && maybe_ne (group_gap_adj, 0U)
> >> -                 && !slp_perm
> >> -                 && known_eq (group_elt, group_size - group_gap_adj))
> >> -               {
> >> -                 poly_wide_int bump_val
> >> -                   = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
> >> -                      * group_gap_adj);
> >> -                 if (tree_int_cst_sgn
> >> -                       (vect_dr_behavior (vinfo, dr_info)->step) == -1)
> >> -                   bump_val = -bump_val;
> >> -                 tree bump = wide_int_to_tree (sizetype, bump_val);
> >> -                 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, 
> >> ptr_incr,
> >> -                                                gsi, stmt_info, bump);
> >> -                 group_elt = 0;
> >> -               }
> >> -           }
> >> -         /* Bump the vector pointer to account for a gap or for excess
> >> -            elements loaded for a permuted SLP load.  */
> >> +         /* With SLP permutation we load the gaps as well, without
> >> +            we need to skip the gaps after we manage to fully load
> >> +            all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
> >> +         group_elt += nunits;
> >>           if (!costing_p
> >>               && maybe_ne (group_gap_adj, 0U)
> >> -             && slp_perm)
> >> +             && !slp_perm
> >> +             && known_eq (group_elt, group_size - group_gap_adj))
> >>             {
> >>               poly_wide_int bump_val
> >> -               = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
> >> -                  * group_gap_adj);
> >> -             if (tree_int_cst_sgn
> >> -                   (vect_dr_behavior (vinfo, dr_info)->step) == -1)
> >> +               = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * 
> >> group_gap_adj);
> >> +             if (tree_int_cst_sgn (vect_dr_behavior (vinfo, 
> >> dr_info)->step)
> >> +                 == -1)
> >>                 bump_val = -bump_val;
> >>               tree bump = wide_int_to_tree (sizetype, bump_val);
> >>               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, 
> >> gsi,
> >>                                              stmt_info, bump);
> >> +             group_elt = 0;
> >>             }
> >>         }
> >> +      /* Bump the vector pointer to account for a gap or for excess
> >> +        elements loaded for a permuted SLP load.  */
> >> +      if (!costing_p
> >> +         && maybe_ne (group_gap_adj, 0U)
> >> +         && slp_perm)
> >> +       {
> >> +         poly_wide_int bump_val
> >> +           = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
> >> +         if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) 
> >> == -1)
> >> +           bump_val = -bump_val;
> >> +         tree bump = wide_int_to_tree (sizetype, bump_val);
> >> +         dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
> >> +                                        stmt_info, bump);
> >> +       }
> >>
> >>        if (slp && !slp_perm)
> >>         continue;
> >> @@ -11120,39 +11117,36 @@ vectorizable_load (vec_info *vinfo,
> >>             }
> >>         }
> >>        else
> >> -        {
> >> -          if (grouped_load)
> >> -           {
> >> -             if (memory_access_type != VMAT_LOAD_STORE_LANES)
> >> +       {
> >> +         if (grouped_load)
> >> +           {
> >> +             gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
> >> +             /* We assume that the cost of a single load-lanes instruction
> >> +                is equivalent to the cost of DR_GROUP_SIZE separate loads.
> >> +                If a grouped access is instead being provided by a
> >> +                load-and-permute operation, include the cost of the
> >> +                permutes.  */
> >> +             if (costing_p && first_stmt_info == stmt_info)
> >>                 {
> >> -                 gcc_assert (memory_access_type == 
> >> VMAT_CONTIGUOUS_PERMUTE);
> >> -                 /* We assume that the cost of a single load-lanes 
> >> instruction
> >> -                    is equivalent to the cost of DR_GROUP_SIZE separate 
> >> loads.
> >> -                    If a grouped access is instead being provided by a
> >> -                    load-and-permute operation, include the cost of the
> >> -                    permutes.  */
> >> -                 if (costing_p && first_stmt_info == stmt_info)
> >> -                   {
> >> -                     /* Uses an even and odd extract operations or shuffle
> >> -                        operations for each needed permute.  */
> >> -                     int group_size = DR_GROUP_SIZE (first_stmt_info);
> >> -                     int nstmts = ceil_log2 (group_size) * group_size;
> >> -                     inside_cost
> >> -                       += record_stmt_cost (cost_vec, nstmts, vec_perm,
> >> -                                            stmt_info, 0, vect_body);
> >> +                 /* Uses an even and odd extract operations or shuffle
> >> +                    operations for each needed permute.  */
> >> +                 int group_size = DR_GROUP_SIZE (first_stmt_info);
> >> +                 int nstmts = ceil_log2 (group_size) * group_size;
> >> +                 inside_cost += record_stmt_cost (cost_vec, nstmts, 
> >> vec_perm,
> >> +                                                  stmt_info, 0, 
> >> vect_body);
> >>
> >> -                     if (dump_enabled_p ())
> >> -                       dump_printf_loc (
> >> -                         MSG_NOTE, vect_location,
> >> -                         "vect_model_load_cost: strided group_size = %d 
> >> .\n",
> >> -                         group_size);
> >> -                   }
> >> -                 else if (!costing_p)
> >> -                   vect_transform_grouped_load (vinfo, stmt_info, 
> >> dr_chain,
> >> -                                                group_size, gsi);
> >> +                 if (dump_enabled_p ())
> >> +                   dump_printf_loc (MSG_NOTE, vect_location,
> >> +                                    "vect_model_load_cost:"
> >> +                                    "strided group_size = %d .\n",
> >> +                                    group_size);
> >> +               }
> >> +             else if (!costing_p)
> >> +               {
> >> +                 vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
> >> +                                              group_size, gsi);
> >> +                 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> >>                 }
> >> -             if (!costing_p)
> >> -               *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> >>             }
> >>           else if (!costing_p)
> >>             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >> @@ -11166,7 +11160,8 @@ vectorizable_load (vec_info *vinfo,
> >>      {
> >>        gcc_assert (memory_access_type != VMAT_INVARIANT
> >>                   && memory_access_type != VMAT_ELEMENTWISE
> >> -                 && memory_access_type != VMAT_STRIDED_SLP);
> >> +                 && memory_access_type != VMAT_STRIDED_SLP
> >> +                 && memory_access_type != VMAT_LOAD_STORE_LANES);
> >>        if (dump_enabled_p ())
> >>         dump_printf_loc (MSG_NOTE, vect_location,
> >>                          "vect_model_load_cost: inside_cost = %u, "
> >> --
> >> 2.31.1
> >
>