The following fixes PR68892: the BB vectorizer now happily creates
lots of dead vector loads (we had a similar bug with loop
single-element interleaving support in the past).  This is fixed as a
side-effect of making the SLP load cost reflect reality.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2015-12-14  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/68892
        * tree-vect-slp.c (vect_analyze_slp_cost_1): Properly compute
        cost for permuted loads.

        * gcc.dg/vect/bb-slp-pr68892.c: New testcase.

Index: gcc/tree-vect-slp.c
===================================================================
*** gcc/tree-vect-slp.c (revision 231617)
--- gcc/tree-vect-slp.c (working copy)
*************** vect_analyze_slp_cost_1 (slp_instance in
*** 1405,1414 ****
  {
    unsigned i;
    slp_tree child;
!   gimple *stmt, *s;
    stmt_vec_info stmt_info;
    tree lhs;
-   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  
    /* Recurse down the SLP tree.  */
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
--- 1405,1413 ----
  {
    unsigned i;
    slp_tree child;
!   gimple *stmt;
    stmt_vec_info stmt_info;
    tree lhs;
  
    /* Recurse down the SLP tree.  */
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
*************** vect_analyze_slp_cost_1 (slp_instance in
*** 1427,1470 ****
                               node, prologue_cost_vec, body_cost_vec);
        else
        {
-         int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
-         /* If the load is permuted then the alignment is determined by
-            the first group element not by the first scalar stmt DR.  */
          if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
            {
              stmt = GROUP_FIRST_ELEMENT (stmt_info);
              stmt_info = vinfo_for_stmt (stmt);
            }
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
-         /* If the load is permuted record the cost for the permutation.
-            ???  Loads from multiple chains are let through here only
-            for a single special case involving complex numbers where
-            in the end no permutation is necessary.  */
-         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
-           if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
-                == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
-               && vect_get_place_in_interleaving_chain
-                    (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
-             {
-               record_stmt_cost (body_cost_vec, group_size, vec_perm,
-                                 stmt_info, 0, vect_body);
-               break;
-             }
        }
      }
!   else
      {
        record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                        stmt_info, 0, vect_body);
!       if (SLP_TREE_TWO_OPERATORS (node))
!       {
!         record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
!                           stmt_info, 0, vect_body);
!         record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
!                           stmt_info, 0, vect_body);
!       }
      }
  
    /* Scan operands and account for prologue cost of constants/externals.
--- 1426,1464 ----
                               node, prologue_cost_vec, body_cost_vec);
        else
        {
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
            {
+             /* If the load is permuted then the alignment is determined by
+                the first group element not by the first scalar stmt DR.  */
              stmt = GROUP_FIRST_ELEMENT (stmt_info);
              stmt_info = vinfo_for_stmt (stmt);
+             /* Record the cost for the permutation.  */
+             record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
+                               stmt_info, 0, vect_body);
+             /* And adjust the number of loads performed.  */
+             unsigned nunits
+               = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
+             ncopies_for_cost
+               = (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
+                  + nunits - 1) / nunits;
+             ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
            }
+         /* Record the cost for the vector loads.  */
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
        }
+       return;
      }
! 
!   record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
!                   stmt_info, 0, vect_body);
!   if (SLP_TREE_TWO_OPERATORS (node))
      {
        record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                        stmt_info, 0, vect_body);
!       record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
!                       stmt_info, 0, vect_body);
      }
  
    /* Scan operands and account for prologue cost of constants/externals.
Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c
===================================================================
*** gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c  (revision 0)
--- gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c  (revision 0)
***************
*** 0 ****
--- 1,16 ----
+ /* { dg-do compile } */
+ /* { dg-additional-options "-fvect-cost-model=dynamic" } */
+ 
+ double a[128][128];
+ double b[128];
+ 
+ void foo(void)
+ {
+   b[0] = a[0][0];
+   b[1] = a[1][0];
+   b[2] = a[2][0];
+   b[3] = a[3][0];
+ }
+ 
+ /* { dg-final { scan-tree-dump "not profitable" "slp2" } } */
+ /* { dg-final { scan-tree-dump-times "Basic block will be vectorized" 0 "slp2" } } */

Reply via email to