This removes the restriction on load permutations that was in place
for reductions.  To avoid regressing gfortran.dg/vect/fast-math-pr37021.f90
the patch also implements SLP permutations for strided group loads.
With those two pieces we can finally SLP vectorize the complex
multiplication in the gfortran.dg/vect/fast-math-pr37021.f90 testcase
(which uses a variable stride).  Code generated on x86_64 suffers from
PR56766: combine is not able to use sse3_addsubv2df3 because the RTL
doesn't use the expected vec_merge but (vec_select (vec_concat ))
instead.
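
To illustrate the loop shape (the actual testcase is Fortran; this C
analogue is only a sketch, and the function and variable names are
mine): a complex multiplication feeding a sum reduction, with one
operand read at a runtime stride, is the kind of code now handled:

void
cmulsum (double *restrict sum, const double *restrict a,
         const double *restrict b, int stride, int n)
{
  double re = 0.0, im = 0.0;
  for (int i = 0; i < n; i++)
    {
      /* Strided group load of a real/imaginary pair; the lanes of
         the complex multiply consume the elements in an order that
         requires a load permutation.  */
      double ar = a[i * stride], ai = a[i * stride + 1];
      double br = b[2 * i], bi = b[2 * i + 1];
      re += ar * br - ai * bi;  /* real part of a*b */
      im += ar * bi + ai * br;  /* imaginary part of a*b */
    }
  sum[0] = re;
  sum[1] = im;
}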

Previously we vectorized this with unrolling and interleaving.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Richard.

2015-06-09  Richard Biener  <rguent...@suse.de>

        * tree-vect-slp.c (vect_attempt_slp_rearrange_stmts): Split
        out from ...
        (vect_supported_load_permutation_p): ... here.  Handle
        supportable permutations in reductions.
        * tree-vect-stmts.c (vectorizable_load): Handle SLP permutations
        for vectorizing strided group loads.
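
As a reading aid for the patch below: after verifying that all loads
share one permutation sequence, the new vect_attempt_slp_rearrange_stmts
helper checks that this sequence is a true permutation of the group.
That check can be modelled standalone roughly as follows (a simplified
sketch, not code from the patch; reduction_perm_valid_p and the
fixed-size bool array are mine, the patch uses an sbitmap over the SLP
instance):

#include <stdbool.h>
#include <string.h>

/* Return true if PERM (GROUP_SIZE entries) uses each load index
   0 .. GROUP_SIZE-1 exactly once, i.e. no duplicates and no gaps.  */
static bool
reduction_perm_valid_p (const unsigned *perm, unsigned group_size)
{
  bool seen[256];

  if (group_size > 256)
    return false;
  memset (seen, 0, sizeof seen);
  for (unsigned i = 0; i < group_size; i++)
    {
      if (perm[i] >= group_size || seen[perm[i]])
        return false;
      seen[perm[i]] = true;
    }
  /* In this simplified model GROUP_SIZE distinct in-range indices
     necessarily cover the whole range, so the gap check is implied
     by the duplicate check.  */
  return true;
}

If the check succeeds, the statements in the SLP tree are rearranged to
match the load order and the recorded permutations are released, so no
runtime permutes have to be generated.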

Index: gcc/tree-vect-slp.c
===================================================================
*** gcc/tree-vect-slp.c (revision 224271)
--- gcc/tree-vect-slp.c (working copy)
*************** vect_slp_rearrange_stmts (slp_tree node,
*** 1326,1331 ****
--- 1270,1336 ----
  }
  
  
+ /* Attempt to reorder stmts in a reduction chain so that we don't
+    require any load permutation.  Return true if that was possible,
+    otherwise return false.  */
+ 
+ static bool
+ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
+ {
+   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+   unsigned int i, j;
+   sbitmap load_index;
+   unsigned int lidx;
+   slp_tree node, load;
+ 
+   /* Compare all the permutation sequences to the first one.  We know
+      that at least one load is permuted.  */
+   node = SLP_INSTANCE_LOADS (slp_instn)[0];
+   if (!node->load_permutation.exists ())
+     return false;
+   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+     {
+       if (!load->load_permutation.exists ())
+         return false;
+       FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+         if (lidx != node->load_permutation[j])
+           return false;
+     }
+ 
+   /* Check that the loads in the first sequence are different and there
+      are no gaps between them.  */
+   load_index = sbitmap_alloc (group_size);
+   bitmap_clear (load_index);
+   FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+     {
+       if (bitmap_bit_p (load_index, lidx))
+         {
+           sbitmap_free (load_index);
+           return false;
+         }
+       bitmap_set_bit (load_index, lidx);
+     }
+   for (i = 0; i < group_size; i++)
+     if (!bitmap_bit_p (load_index, i))
+       {
+         sbitmap_free (load_index);
+         return false;
+       }
+   sbitmap_free (load_index);
+ 
+   /* This permutation is valid for reduction.  Since the order of the
+      statements in the nodes is not important unless they are memory
+      accesses, we can rearrange the statements in all the nodes
+      according to the order of the loads.  */
+   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+                             node->load_permutation);
+ 
+   /* We are done, no actual permutations need to be generated.  */
+   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+     SLP_TREE_LOAD_PERMUTATION (node).release ();
+   return true;
+ }
+ 
  /* Check if the required load permutations in the SLP instance
     SLP_INSTN are supported.  */
  
*************** vect_supported_load_permutation_p (slp_i
*** 1334,1340 ****
  {
    unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
    unsigned int i, j, k, next;
-   sbitmap load_index;
    slp_tree node;
    gimple stmt, load, next_load, first_load;
    struct data_reference *dr;
--- 1339,1344 ----
*************** vect_supported_load_permutation_p (slp_i
*** 1369,1427 ****
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  
    /* Reduction (there are no data-refs in the root).
!      In reduction chain the order of the loads is important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
!       slp_tree load;
!       unsigned int lidx;
! 
!       /* Compare all the permutation sequences to the first one.  We know
!          that at least one load is permuted.  */
!       node = SLP_INSTANCE_LOADS (slp_instn)[0];
!       if (!node->load_permutation.exists ())
!         return false;
!       for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
!         {
!           if (!load->load_permutation.exists ())
!             return false;
!           FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
!             if (lidx != node->load_permutation[j])
!               return false;
!         }
  
!       /* Check that the loads in the first sequence are different and there
!          are no gaps between them.  */
!       load_index = sbitmap_alloc (group_size);
!       bitmap_clear (load_index);
!       FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
!         {
!           if (bitmap_bit_p (load_index, lidx))
!             {
!               sbitmap_free (load_index);
!               return false;
!             }
!           bitmap_set_bit (load_index, lidx);
!         }
!       for (i = 0; i < group_size; i++)
!         if (!bitmap_bit_p (load_index, i))
!           {
!             sbitmap_free (load_index);
!             return false;
!           }
!       sbitmap_free (load_index);
! 
!       /* This permutation is valid for reduction.  Since the order of the
!          statements in the nodes is not important unless they are memory
!          accesses, we can rearrange the statements in all the nodes
!          according to the order of the loads.  */
!       vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
!                                 node->load_permutation);
! 
!       /* We are done, no actual permutations need to be generated.  */
!       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
!         SLP_TREE_LOAD_PERMUTATION (node).release ();
!       return true;
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
--- 1373,1386 ----
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  
    /* Reduction (there are no data-refs in the root).
!      In reduction chain the order of the loads is not important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
!       if (vect_attempt_slp_rearrange_stmts (slp_instn))
!         return true;
  
!       /* Fallthru to general load permutation handling.  */
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c       (revision 224271)
--- gcc/tree-vect-stmts.c       (working copy)
*************** vectorizable_load (gimple stmt, gimple_s
*** 5995,6003 ****
        if ((grouped_load
           && (slp || PURE_SLP_STMT (stmt_info)))
          && (group_size > nunits
!             || nunits % group_size != 0
!             /* We don't support load permutations.  */
!             || slp_perm))
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "unhandled strided group load\n");
--- 5995,6001 ----
        if ((grouped_load
           && (slp || PURE_SLP_STMT (stmt_info)))
          && (group_size > nunits
!             || nunits % group_size != 0))
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "unhandled strided group load\n");
*************** vectorizable_load (gimple stmt, gimple_s
*** 6294,6299 ****
--- 6292,6298 ----
        alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
        int nloads = nunits;
        tree ltype = TREE_TYPE (vectype);
+       auto_vec<tree> dr_chain;
        if (slp)
        {
          nloads = nunits / group_size;
*************** vectorizable_load (gimple stmt, gimple_s
*** 6303,6309 ****
            ltype = vectype;
          ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
          ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!         gcc_assert (!slp_perm);
        }
        for (j = 0; j < ncopies; j++)
        {
--- 6302,6309 ----
            ltype = vectype;
          ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
          ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!         if (slp_perm)
!           dr_chain.create (ncopies);
        }
        for (j = 0; j < ncopies; j++)
        {
*************** vectorizable_load (gimple stmt, gimple_s
*** 6350,6362 ****
            }
  
          if (slp)
!           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
        return true;
      }
  
--- 6350,6369 ----
            }
  
          if (slp)
!           {
!             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
!             if (slp_perm)
!               dr_chain.quick_push (gimple_assign_lhs (new_stmt));
!           }
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
+       if (slp_perm)
+         vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
+                                       slp_node_instance, false);
        return true;
      }
  

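The tree-vect-stmts.c side lifts the previous !slp_perm restriction on
the element-wise strided-load path: the per-element loads are collected
in dr_chain and, when a permutation is recorded, shuffled via
vect_transform_slp_perm_load.  A hypothetical source-level example of
what this enables (illustrative only, not from the testsuite) is a
strided group whose members are consumed in swapped order:

void
swap_pairs (double *restrict out, const double *restrict in,
            int stride, int n)
{
  for (int i = 0; i < n; i++)
    {
      /* The SLP lanes read the group { in[i*stride], in[i*stride+1] }
         in reversed order, which needs a load permutation.  */
      out[2 * i] = in[i * stride + 1];
      out[2 * i + 1] = in[i * stride];
    }
}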