The following paves the way for supporting more load permutations.
I've split it off from the patch that adds support for more permutations,
as that one still requires a load of surgery :/

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2013-04-19  Richard Biener  <rguent...@suse.de>

        * tree-vectorizer.h (struct _slp_instance): Move load_permutation
        member ...
        (struct _slp_tree): ... here.  Make it a vector of unsigned ints.
        (SLP_INSTANCE_LOAD_PERMUTATION): Remove.
        (SLP_TREE_LOAD_PERMUTATION): Add.
        (vect_transform_slp_perm_load): Adjust prototype.
        * tree-vect-slp.c (vect_free_slp_tree): Adjust.
        (vect_free_slp_instance): Likewise.
        (vect_create_new_slp_node): Likewise.
        (vect_supported_slp_permutation_p): Remove.
        (vect_slp_rearrange_stmts): Adjust.
        (vect_supported_load_permutation_p): Likewise.  Inline
        vect_supported_slp_permutation_p here.
        (vect_analyze_slp_instance): Compute load permutations per
        slp node instead of per instance.
        (vect_get_slp_defs): Adjust.
        (vect_transform_slp_perm_load): Likewise.
        (vect_schedule_slp_instance): Remove redundant code.
        (vect_schedule_slp): Remove hack for PR56270, add it ...
        * tree-vect-stmts.c (vectorizable_load): ... here, do not
        CSE loads for SLP.  Adjust.

Index: trunk/gcc/tree-vect-slp.c
===================================================================
*** trunk.orig/gcc/tree-vect-slp.c      2013-04-19 12:43:20.000000000 +0200
--- trunk/gcc/tree-vect-slp.c   2013-04-19 13:04:29.317524077 +0200
*************** vect_free_slp_tree (slp_tree node)
*** 78,83 ****
--- 78,84 ----
    SLP_TREE_CHILDREN (node).release ();
    SLP_TREE_SCALAR_STMTS (node).release ();
    SLP_TREE_VEC_STMTS (node).release ();
+   SLP_TREE_LOAD_PERMUTATION (node).release ();
  
    free (node);
  }
*************** void
*** 89,95 ****
  vect_free_slp_instance (slp_instance instance)
  {
    vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
-   SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
    SLP_INSTANCE_LOADS (instance).release ();
    SLP_INSTANCE_BODY_COST_VEC (instance).release ();
    free (instance);
--- 90,95 ----
*************** vect_create_new_slp_node (vec<gimple> sc
*** 120,125 ****
--- 120,126 ----
    SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
    SLP_TREE_VEC_STMTS (node).create (0);
    SLP_TREE_CHILDREN (node).create (nops);
+   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  
    return node;
  }
*************** vect_mark_slp_stmts_relevant (slp_tree n
*** 1026,1098 ****
  }
  
  
- /* Check if the permutation required by the SLP INSTANCE is supported.
-    Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.  */
- 
- static bool
- vect_supported_slp_permutation_p (slp_instance instance)
- {
-   slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
-   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
-   gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
-   vec<slp_tree> sorted_loads = vNULL;
-   int index;
-   slp_tree *tmp_loads = NULL;
-   int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
-   slp_tree load;
- 
-   /* FORNOW: The only supported loads permutation is loads from the same
-      location in all the loads in the node, when the data-refs in
-      nodes of LOADS constitute an interleaving chain.
-      Sort the nodes according to the order of accesses in the chain.  */
-   tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
-   for (i = 0, j = 0;
-        SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
-        && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
-        i += group_size, j++)
-     {
-       gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
-       /* Check that the loads are all in the same interleaving chain.  */
-       if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
-         {
-           if (dump_enabled_p ())
-             {
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "Build SLP failed: unsupported data "
-                              "permutation ");
-               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                               scalar_stmt, 0);
-             }
- 
-           free (tmp_loads);
-           return false;
-         }
- 
-       tmp_loads[index] = load;
-     }
- 
-   sorted_loads.create (group_size);
-   for (i = 0; i < group_size; i++)
-      sorted_loads.safe_push (tmp_loads[i]);
- 
-   SLP_INSTANCE_LOADS (instance).release ();
-   SLP_INSTANCE_LOADS (instance) = sorted_loads;
-   free (tmp_loads);
- 
-   if (!vect_transform_slp_perm_load (stmt, vNULL, NULL,
-                                      SLP_INSTANCE_UNROLLING_FACTOR (instance),
-                                      instance, true))
-     return false;
- 
-   return true;
- }
- 
- 
  /* Rearrange the statements of NODE according to PERMUTATION.  */
  
  static void
  vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
!                           vec<int> permutation)
  {
    gimple stmt;
    vec<gimple> tmp_stmts;
--- 1027,1037 ----
  }
  
  
  /* Rearrange the statements of NODE according to PERMUTATION.  */
  
  static void
  vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
!                           vec<unsigned> permutation)
  {
    gimple stmt;
    vec<gimple> tmp_stmts;
*************** vect_slp_rearrange_stmts (slp_tree node,
*** 1114,1145 ****
  }
  
  
! /* Check if the required load permutation is supported.
!    LOAD_PERMUTATION contains a list of indices of the loads.
!    In SLP this permutation is relative to the order of grouped stores that are
!    the base of the SLP instance.  */
  
  static bool
! vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
!                                    vec<int> load_permutation)
  {
!   int i = 0, j, prev = -1, next, k, number_of_groups;
!   bool supported, bad_permutation = false;
    sbitmap load_index;
    slp_tree node;
    gimple stmt, load, next_load, first_load;
    struct data_reference *dr;
-   bb_vec_info bb_vinfo;
- 
-   /* FORNOW: permutations are only supported in SLP.  */
-   if (!slp_instn)
-     return false;
  
    if (dump_enabled_p ())
      {
        dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
!       FOR_EACH_VEC_ELT (load_permutation, i, next)
!         dump_printf (MSG_NOTE, "%d ", next);
      }
  
    /* In case of reduction every load permutation is allowed, since the order
--- 1053,1081 ----
  }
  
  
! /* Check if the required load permutations in the SLP instance
!    SLP_INSTN are supported.  */
  
  static bool
! vect_supported_load_permutation_p (slp_instance slp_instn)
  {
!   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
!   unsigned int i, j, k, next;
    sbitmap load_index;
    slp_tree node;
    gimple stmt, load, next_load, first_load;
    struct data_reference *dr;
  
    if (dump_enabled_p ())
      {
        dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
!       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
!       if (node->load_permutation.exists ())
!         FOR_EACH_VEC_ELT (node->load_permutation, j, next)
!           dump_printf (MSG_NOTE, "%d ", next);
!       else
!         for (i = 0; i < group_size; ++i)
!           dump_printf (MSG_NOTE, "%d ", i);
      }
  
    /* In case of reduction every load permutation is allowed, since the order
*************** vect_supported_load_permutation_p (slp_i
*** 1150,1358 ****
       permutation).  */
  
    /* Check that all the load nodes are of the same size.  */
    FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
      if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
        return false;
  
    node = SLP_INSTANCE_TREE (slp_instn);
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
-   /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
-      instance, not all the loads belong to the same node or interleaving
-      group.  Hence, we need to divide them into groups according to
-      GROUP_SIZE.  */
-   number_of_groups = load_permutation.length () / group_size;
  
    /* Reduction (there are no data-refs in the root).
       In reduction chain the order of the loads is important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
!       int first_group_load_index;
! 
!       /* Compare all the permutation sequences to the first one.  */
!       for (i = 1; i < number_of_groups; i++)
!         {
!           k = 0;
!           for (j = i * group_size; j < i * group_size + group_size; j++)
!             {
!               next = load_permutation[j];
!               first_group_load_index = load_permutation[k];
! 
!               if (next != first_group_load_index)
!                 {
!                   bad_permutation = true;
!                   break;
!                 }
  
!               k++;
!             }
! 
!           if (bad_permutation)
!             break;
!         }
! 
!       if (!bad_permutation)
!         {
!           /* Check that the loads in the first sequence are different and there
!              are no gaps between them.  */
!           load_index = sbitmap_alloc (group_size);
!           bitmap_clear (load_index);
!           for (k = 0; k < group_size; k++)
!             {
!               first_group_load_index = load_permutation[k];
!               if (bitmap_bit_p (load_index, first_group_load_index))
!                 {
!                   bad_permutation = true;
!                   break;
!                 }
! 
!               bitmap_set_bit (load_index, first_group_load_index);
!             }
! 
!           if (!bad_permutation)
!             for (k = 0; k < group_size; k++)
!               if (!bitmap_bit_p (load_index, k))
!                 {
!                   bad_permutation = true;
!                   break;
!                 }
  
!           sbitmap_free (load_index);
!         }
  
!       if (!bad_permutation)
!         {
!           /* This permutation is valid for reduction.  Since the order of the
!              statements in the nodes is not important unless they are memory
!              accesses, we can rearrange the statements in all the nodes 
!              according to the order of the loads.  */
!           vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
!                                     load_permutation);
!           SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
!           return true;
!         }
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
       chain.
       FORNOW: not supported in loop SLP because of realignment compications.  */
!   bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
!   bad_permutation = false;
!   /* Check that for every node in the instance the loads form a subchain.  */
!   if (bb_vinfo)
      {
        FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
          {
            next_load = NULL;
-           first_load = NULL;
            FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
              {
-               if (!first_load)
-                 first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
-               else if (first_load
-                          != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
-                 {
-                   bad_permutation = true;
-                 break;
-               }
- 
                if (j != 0 && next_load != load)
!                 {
!                   bad_permutation = true;
!                   break;
!                 }
! 
                next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
              }
- 
-           if (bad_permutation)
-             break;
          }
  
        /* Check that the alignment of the first load in every subchain, i.e.,
!          the first statement in every load node, is supported.  */
!       if (!bad_permutation)
!         {
!           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
!             {
!               first_load = SLP_TREE_SCALAR_STMTS (node)[0];
!               if (first_load
!                     != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
!                 {
!                   dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
!                   if (vect_supportable_dr_alignment (dr, false)
!                      == dr_unaligned_unsupported)
!                     {
!                     if (dump_enabled_p ())
!                       {
!                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
!                                          vect_location, 
!                                          "unsupported unaligned load ");
!                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
!                                           first_load, 0);
!                         }
!                     bad_permutation = true;
!                       break;
!                     }
!               }
!             }
  
!           if (!bad_permutation)
!             {
!               SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
!               return true;
!           }
!         }
      }
  
    /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
       GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
       well (unless it's reduction).  */
!   if (load_permutation.length ()
!       != (unsigned int) (group_size * group_size))
      return false;
  
-   supported = true;
    load_index = sbitmap_alloc (group_size);
    bitmap_clear (load_index);
!   for (j = 0; j < group_size; j++)
      {
!       for (i = j * group_size, k = 0;
!            load_permutation.iterate (i, &next) && k < group_size;
!            i++, k++)
!        {
!          if (i != j * group_size && next != prev)
!           {
!             supported = false;
!             break;
!           }
! 
!          prev = next;
!        }
! 
!       if (bitmap_bit_p (load_index, prev))
!         {
!           supported = false;
!           break;
!         }
! 
!       bitmap_set_bit (load_index, prev);
      }
!  
!   for (j = 0; j < group_size; j++)
!     if (!bitmap_bit_p (load_index, j))
        {
        sbitmap_free (load_index);
        return false;
        }
- 
    sbitmap_free (load_index);
  
!   if (supported && i == group_size * group_size
!       && vect_supported_slp_permutation_p (slp_instn))
!     return true;
! 
!   return false;
  }
  
  
--- 1086,1246 ----
       permutation).  */
  
    /* Check that all the load nodes are of the same size.  */
+   /* ???  Can't we assert this? */
    FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
      if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
        return false;
  
    node = SLP_INSTANCE_TREE (slp_instn);
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  
    /* Reduction (there are no data-refs in the root).
       In reduction chain the order of the loads is important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
!       slp_tree load;
!       unsigned int lidx;
  
!       /* Compare all the permutation sequences to the first one.  We know
!          that at least one load is permuted.  */
!       node = SLP_INSTANCE_LOADS (slp_instn)[0];
!       if (!node->load_permutation.exists ())
!       return false;
!       for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
!       {
!         if (!load->load_permutation.exists ())
!           return false;
!         FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
!           if (lidx != node->load_permutation[j])
!             return false;
!       }
  
!       /* Check that the loads in the first sequence are different and there
!        are no gaps between them.  */
!       load_index = sbitmap_alloc (group_size);
!       bitmap_clear (load_index);
!       FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
!       {
!         if (bitmap_bit_p (load_index, lidx))
!           {
!             sbitmap_free (load_index);
!             return false;
!           }
!         bitmap_set_bit (load_index, lidx);
!       }
!       for (i = 0; i < group_size; i++)
!       if (!bitmap_bit_p (load_index, i))
!         {
!           sbitmap_free (load_index);
!           return false;
!         }
!       sbitmap_free (load_index);
! 
!       /* This permutation is valid for reduction.  Since the order of the
!        statements in the nodes is not important unless they are memory
!        accesses, we can rearrange the statements in all the nodes
!        according to the order of the loads.  */
!       vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
!                               node->load_permutation);
  
!       /* We are done, no actual permutations need to be generated.  */
!       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
!       SLP_TREE_LOAD_PERMUTATION (node).release ();
!       return true;
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
       chain.
       FORNOW: not supported in loop SLP because of realignment compications.  */
!   if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
      {
+       /* Check that for every node in the instance the loads
+        form a subchain.  */
        FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
          {
            next_load = NULL;
            FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
              {
                if (j != 0 && next_load != load)
!               return false;
                next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
              }
          }
  
        /* Check that the alignment of the first load in every subchain, i.e.,
!          the first statement in every load node, is supported.
!        ???  This belongs in alignment checking.  */
!       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
!       {
!         first_load = SLP_TREE_SCALAR_STMTS (node)[0];
!         if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
!           {
!             dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
!             if (vect_supportable_dr_alignment (dr, false)
!                 == dr_unaligned_unsupported)
!               {
!                 if (dump_enabled_p ())
!                   {
!                     dump_printf_loc (MSG_MISSED_OPTIMIZATION,
!                                      vect_location,
!                                      "unsupported unaligned load ");
!                     dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
!                                       first_load, 0);
!                   }
!                 return false;
!               }
!           }
!       }
  
!       /* We are done, no actual permutations need to be generated.  */
!       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
!       SLP_TREE_LOAD_PERMUTATION (node).release ();
!       return true;
      }
  
    /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
       GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
       well (unless it's reduction).  */
!   if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
      return false;
+   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+     if (!node->load_permutation.exists ())
+       return false;
  
    load_index = sbitmap_alloc (group_size);
    bitmap_clear (load_index);
!   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
      {
!       unsigned int lidx = node->load_permutation[0];
!       if (bitmap_bit_p (load_index, lidx))
!       {
!         sbitmap_free (load_index);
!         return false;
!       }
!       bitmap_set_bit (load_index, lidx);
!       FOR_EACH_VEC_ELT (node->load_permutation, j, k)
!       if (k != lidx)
!         {
!           sbitmap_free (load_index);
!           return false;
!         }
      }
!   for (i = 0; i < group_size; i++)
!     if (!bitmap_bit_p (load_index, i))
        {
        sbitmap_free (load_index);
        return false;
        }
    sbitmap_free (load_index);
  
!   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
!     if (node->load_permutation.exists ()
!       && !vect_transform_slp_perm_load
!             (node, vNULL, NULL,
!              SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
!       return false;
!   return true;
  }
  
  
*************** vect_analyze_slp_instance (loop_vec_info
*** 1642,1658 ****
        SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
        SLP_INSTANCE_LOADS (new_instance) = loads;
        SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
-       SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = vNULL;
  
        /* Compute the load permutation.  */
        slp_tree load_node;
        bool loads_permuted = false;
-       vec<int> load_permutation;
-       load_permutation.create (group_size * group_size);
        FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          int j;
          gimple load, first_stmt;
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
--- 1530,1546 ----
        SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
        SLP_INSTANCE_LOADS (new_instance) = loads;
        SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
  
        /* Compute the load permutation.  */
        slp_tree load_node;
        bool loads_permuted = false;
        FOR_EACH_VEC_ELT (loads, i, load_node)
        {
+         vec<unsigned> load_permutation;
          int j;
          gimple load, first_stmt;
+         bool this_load_permuted = false;
+         load_permutation.create (group_size);
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
*************** vect_analyze_slp_instance (loop_vec_info
*** 1661,1676 ****
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
!               loads_permuted = true;
              load_permutation.safe_push (load_place);
            }
        }
  
        if (loads_permuted)
          {
!         SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
!           if (!vect_supported_load_permutation_p (new_instance, group_size,
!                                                   load_permutation))
              {
                if (dump_enabled_p ())
                  {
--- 1549,1569 ----
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
!               this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
+         if (!this_load_permuted)
+           {
+             load_permutation.release ();
+             continue;
+           }
+         SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
+         loads_permuted = true;
        }
  
        if (loads_permuted)
          {
!           if (!vect_supported_load_permutation_p (new_instance))
              {
                if (dump_enabled_p ())
                  {
*************** vect_analyze_slp_instance (loop_vec_info
*** 1679,1694 ****
                                   "permutation ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  }
- 
                vect_free_slp_instance (new_instance);
                return false;
              }
  
            SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
!              = vect_find_first_load_in_slp_instance (new_instance);
          }
-       else
-         load_permutation.release ();
  
        /* Compute the costs of this SLP instance.  */
        vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
--- 1572,1584 ----
                                   "permutation ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  }
                vect_free_slp_instance (new_instance);
                return false;
              }
  
            SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
!           = vect_find_first_load_in_slp_instance (new_instance);
          }
  
        /* Compute the costs of this SLP instance.  */
        vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
*************** vect_get_slp_defs (vec<tree> ops, slp_tr
*** 2653,2659 ****
        vectorized_defs = false;
        if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
          {
!           child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index];
  
          /* We have to check both pattern and original def, if available.  */
          gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
--- 2543,2549 ----
        vectorized_defs = false;
        if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
          {
!           child = SLP_TREE_CHILDREN (slp_node)[child_index];
  
          /* We have to check both pattern and original def, if available.  */
          gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
*************** vect_get_mask_element (gimple stmt, int
*** 2854,2869 ****
  
  /* Generate vector permute statements from a list of loads in DR_CHAIN.
     If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
!    permute statements for SLP_NODE_INSTANCE.  */
  bool
! vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
                                gimple_stmt_iterator *gsi, int vf,
                                slp_instance slp_node_instance, bool analyze_only)
  {
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
    tree mask_element_type = NULL_TREE, mask_type;
    int i, j, k, nunits, vec_index = 0, scalar_index;
-   slp_tree node;
    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
    gimple next_scalar_stmt;
    int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
--- 2744,2761 ----
  
  /* Generate vector permute statements from a list of loads in DR_CHAIN.
     If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
!    permute statements for the SLP node NODE of the SLP instance
!    SLP_NODE_INSTANCE.  */
! 
  bool
! vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                                gimple_stmt_iterator *gsi, int vf,
                                slp_instance slp_node_instance, bool analyze_only)
  {
+   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
    tree mask_element_type = NULL_TREE, mask_type;
    int i, j, k, nunits, vec_index = 0, scalar_index;
    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
    gimple next_scalar_stmt;
    int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
*************** vect_transform_slp_perm_load (gimple stm
*** 2910,2915 ****
--- 2802,2810 ----
       relatively to SLP_NODE_INSTANCE unrolling factor.  */
    ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
  
+   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+     return false;
+ 
    /* Generate permutation masks for every NODE. Number of masks for each NODE
       is equal to GROUP_SIZE.
       E.g., we have a group of three nodes with three loads from the same
*************** vect_transform_slp_perm_load (gimple stm
*** 2928,2934 ****
       we need the second and the third vectors: {b1,c1,a2,b2} and
       {c2,a3,b3,c3}.  */
  
-   FOR_EACH_VEC_ELT  (SLP_INSTANCE_LOADS (slp_node_instance), i, node)
      {
        scalar_index = 0;
        index = 0;
--- 2823,2828 ----
*************** vect_transform_slp_perm_load (gimple stm
*** 2944,2949 ****
--- 2838,2844 ----
          {
            for (k = 0; k < group_size; k++)
              {
+             i = SLP_TREE_LOAD_PERMUTATION (node)[k];
                first_mask_element = i + j * group_size;
                if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                          nunits, only_one_vec, index,
*************** vect_transform_slp_perm_load (gimple stm
*** 2956,2964 ****
  
                if (index == nunits)
                  {
!                 tree mask_vec, *mask_elts;
!                 int l;
! 
                  if (!can_vec_perm_p (mode, false, mask))
                    {
                      if (dump_enabled_p ())
--- 2851,2857 ----
  
                if (index == nunits)
                  {
!                 index = 0;
                  if (!can_vec_perm_p (mode, false, mask))
                    {
                      if (dump_enabled_p ())
*************** vect_transform_slp_perm_load (gimple stm
*** 2974,2988 ****
                      return false;
                    }
  
-                 mask_elts = XALLOCAVEC (tree, nunits);
-                 for (l = 0; l < nunits; ++l)
-                   mask_elts[l] = build_int_cst (mask_element_type, mask[l]);
-                 mask_vec = build_vector (mask_type, mask_elts);
-                 index = 0;
- 
                    if (!analyze_only)
                      {
!                       if (need_next_vector)
                          {
                            first_vec_index = second_vec_index;
                            second_vec_index = vec_index;
--- 2867,2883 ----
                      return false;
                    }
  
                    if (!analyze_only)
                      {
!                     int l;
!                     tree mask_vec, *mask_elts;
!                     mask_elts = XALLOCAVEC (tree, nunits);
!                     for (l = 0; l < nunits; ++l)
!                       mask_elts[l] = build_int_cst (mask_element_type,
!                                                     mask[l]);
!                     mask_vec = build_vector (mask_type, mask_elts);
! 
!                     if (need_next_vector)
                          {
                            first_vec_index = second_vec_index;
                            second_vec_index = vec_index;
*************** vect_schedule_slp_instance (slp_tree nod
*** 3019,3025 ****
    unsigned int vec_stmts_size, nunits, group_size;
    tree vectype;
    int i;
-   slp_tree loads_node;
    slp_tree child;
  
    if (!node)
--- 2914,2919 ----
*************** vect_schedule_slp_instance (slp_tree nod
*** 3043,3062 ****
       size.  */
    vec_stmts_size = (vectorization_factor * group_size) / nunits;
  
-   /* In case of load permutation we have to allocate vectorized statements for
-      all the nodes that participate in that permutation.  */
-   if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
-     {
-       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node)
-         {
-           if (!SLP_TREE_VEC_STMTS (loads_node).exists ())
-             {
-               SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size);
-               SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
-             }
-         }
-     }
- 
    if (!SLP_TREE_VEC_STMTS (node).exists ())
      {
        SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
--- 2937,2942 ----
*************** vect_schedule_slp_instance (slp_tree nod
*** 3074,3080 ****
    if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
        && STMT_VINFO_GROUPED_ACCESS (stmt_info)
        && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
!       && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
      si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
    else if (is_pattern_stmt_p (stmt_info))
      si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
--- 2954,2960 ----
    if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
        && STMT_VINFO_GROUPED_ACCESS (stmt_info)
        && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
!       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
      si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
    else if (is_pattern_stmt_p (stmt_info))
      si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
*************** vect_schedule_slp (loop_vec_info loop_vi
*** 3153,3160 ****
  {
    vec<slp_instance> slp_instances;
    slp_instance instance;
!   slp_tree loads_node;
!   unsigned int i, j, vf;
    bool is_store = false;
  
    if (loop_vinfo)
--- 3033,3039 ----
  {
    vec<slp_instance> slp_instances;
    slp_instance instance;
!   unsigned int i, vf;
    bool is_store = false;
  
    if (loop_vinfo)
*************** vect_schedule_slp (loop_vec_info loop_vi
*** 3173,3186 ****
        /* Schedule the tree of INSTANCE.  */
        is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                               instance, vf);
- 
-       /* Clear STMT_VINFO_VEC_STMT of all loads.  With shared loads
-          between SLP instances we fail to properly initialize the
-        vectorized SLP stmts and confuse different load permutations.  */
-       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node)
-       STMT_VINFO_VEC_STMT
-         (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL;
- 
        if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                           "vectorizing stmts using SLP.");
--- 3052,3057 ----
Index: trunk/gcc/tree-vect-stmts.c
===================================================================
*** trunk.orig/gcc/tree-vect-stmts.c    2013-04-19 12:43:20.000000000 +0200
--- trunk/gcc/tree-vect-stmts.c 2013-04-19 13:02:46.114368141 +0200
*************** vectorizable_load (gimple stmt, gimple_s
*** 4754,4765 ****
      {
        first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
        if (slp
!           && !SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance).exists ()
          && first_stmt != SLP_TREE_SCALAR_STMTS (slp_node)[0])
          first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  
        /* Check if the chain of loads is already vectorized.  */
!       if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
        {
          *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
          return true;
--- 4754,4774 ----
      {
        first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
        if (slp
!           && !SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
          && first_stmt != SLP_TREE_SCALAR_STMTS (slp_node)[0])
          first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  
        /* Check if the chain of loads is already vectorized.  */
!       if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt))
!         /* For SLP we would need to copy over SLP_TREE_VEC_STMTS.
!            ???  But we can only do so if there is exactly one
!            as we have no way to get at the rest.  Leave the CSE
!            opportunity alone.
!            ???  With the group load eventually participating
!            in multiple different permutations (having multiple
!            slp nodes which refer to the same group) the CSE
!            would even generate wrong code.  See PR56270.  */
!         && !slp)
        {
          *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
          return true;
*************** vectorizable_load (gimple stmt, gimple_s
*** 4772,4778 ****
        {
          grouped_load = false;
          vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!           if (SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance).exists ())
              slp_perm = true;
          group_gap = GROUP_GAP (vinfo_for_stmt (first_stmt));
        }
--- 4781,4787 ----
        {
          grouped_load = false;
          vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!           if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
              slp_perm = true;
          group_gap = GROUP_GAP (vinfo_for_stmt (first_stmt));
        }
*************** vectorizable_load (gimple stmt, gimple_s
*** 5163,5169 ****
  
        if (slp_perm)
          {
!           if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi, vf,
                                               slp_node_instance, false))
              {
                dr_chain.release ();
--- 5172,5178 ----
  
        if (slp_perm)
          {
!           if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
                                               slp_node_instance, false))
              {
                dr_chain.release ();
Index: trunk/gcc/tree-vectorizer.h
===================================================================
*** trunk.orig/gcc/tree-vectorizer.h    2013-04-19 12:43:20.000000000 +0200
--- trunk/gcc/tree-vectorizer.h 2013-04-19 12:48:52.156018695 +0200
*************** struct _slp_tree {
*** 106,111 ****
--- 106,114 ----
    vec<slp_tree> children;
    /* A group of scalar stmts to be vectorized together.  */
    vec<gimple> stmts;
+   /* Load permutation relative to the stores, NULL if there is no
+      permutation.  */
+   vec<unsigned> load_permutation;
    /* Vectorized stmt/s.  */
    vec<gimple> vec_stmts;
    /* Number of vector stmts that are created to replace the group of scalar
*************** typedef struct _slp_instance {
*** 131,140 ****
    /* Vectorization costs associated with SLP instance.  */
    stmt_vector_for_cost body_cost_vec;
  
-   /* Loads permutation relatively to the stores, NULL if there is no
-      permutation.  */
-   vec<int> load_permutation;
- 
    /* The group of nodes that contain loads of this SLP instance.  */
    vec<slp_tree> loads;
  
--- 134,139 ----
*************** typedef struct _slp_instance {
*** 149,155 ****
  #define SLP_INSTANCE_GROUP_SIZE(S)               (S)->group_size
  #define SLP_INSTANCE_UNROLLING_FACTOR(S)         (S)->unrolling_factor
  #define SLP_INSTANCE_BODY_COST_VEC(S)            (S)->body_cost_vec
- #define SLP_INSTANCE_LOAD_PERMUTATION(S)         (S)->load_permutation
  #define SLP_INSTANCE_LOADS(S)                    (S)->loads
  #define SLP_INSTANCE_FIRST_LOAD_STMT(S)          (S)->first_load
  
--- 148,153 ----
*************** typedef struct _slp_instance {
*** 157,162 ****
--- 155,161 ----
  #define SLP_TREE_SCALAR_STMTS(S)                 (S)->stmts
  #define SLP_TREE_VEC_STMTS(S)                    (S)->vec_stmts
  #define SLP_TREE_NUMBER_OF_VEC_STMTS(S)          (S)->vec_stmts_size
+ #define SLP_TREE_LOAD_PERMUTATION(S)             (S)->load_permutation
  
  /* This structure is used in creation of an SLP tree.  Each instance
     corresponds to the same operand in a group of scalar stmts in an SLP
*************** extern int vect_get_single_scalar_iterat
*** 961,967 ****
  
  /* In tree-vect-slp.c.  */
  extern void vect_free_slp_instance (slp_instance);
! extern bool vect_transform_slp_perm_load (gimple, vec<tree> ,
                                            gimple_stmt_iterator *, int,
                                            slp_instance, bool);
  extern bool vect_schedule_slp (loop_vec_info, bb_vec_info);
--- 960,966 ----
  
  /* In tree-vect-slp.c.  */
  extern void vect_free_slp_instance (slp_instance);
! extern bool vect_transform_slp_perm_load (slp_tree, vec<tree> ,
                                            gimple_stmt_iterator *, int,
                                            slp_instance, bool);
  extern bool vect_schedule_slp (loop_vec_info, bb_vec_info);

Reply via email to