I noticed you cannot enable regular loop distribution at -O3 unless
you disable pattern recognition.  This lead me to simplifying the
code, changing semantics of the flags slightly (but still compatible
with what I read into their documentation).  -floop-distribution
enables loop distribution (but not pattern recognition) and
-floop-distribute-patterns enables pattern recognition but not
generic loop distribution.  The latter is still enabled by -O3,
if you enable both you get both.

The patch also removes one kind of "similar memory access" detection.
I have a followup that makes the remaining one cheaper.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2012-06-04  Richard Guenther  <rguent...@suse.de>

        * tree-data-ref.c (have_similar_memory_accesses_1): Remove.
        (ref_base_address_1): Likewise.
        (remove_similar_memory_refs): Likewise.
        * tree-data-ref.h (remove_similar_memory_refs): Remove.
        * tree-loop-distribution.c (classify_partition): Do not classify
        as builtin if -ftree-loop-distribute-patterns is not enabled.
        (fuse_partitions_with_similar_memory_accesses): Inline ...
        (ldist_gen): ... here.  Fuse all non-builtin partitions if
        -ftree-loop-distribution is not enabled.  Properly return
        the number of created partitions.  Do not update SSA form here
        but ...
        (tree_loop_distribution): ... once here for the whole function.
        Only walk innermost loops, constrain loops we consider here
        further.  Do not call remove_similar_memory_refs.
        (distribute_loop): Do not check number of loop nodes here.

        * gcc.dg/tree-ssa/ldist-11.c: Enable -ftree-loop-distribute-patterns.
        * gcc.dg/tree-ssa/ldist-17.c: Likewise.
        * gcc.dg/tree-ssa/ldist-pr45948.c: Likewise.

Index: gcc/tree-data-ref.c
===================================================================
*** gcc/tree-data-ref.c.orig    2012-06-01 14:21:36.000000000 +0200
--- gcc/tree-data-ref.c 2012-06-01 15:26:42.826591786 +0200
*************** have_similar_memory_accesses (gimple s1,
*** 5403,5467 ****
    VEC_free (data_ref_loc, heap, refs2);
    return res;
  }
- 
- /* Helper function for the hashtab.  */
- 
- static int
- have_similar_memory_accesses_1 (const void *s1, const void *s2)
- {
-   return have_similar_memory_accesses (CONST_CAST_GIMPLE ((const_gimple) s1),
-                                      CONST_CAST_GIMPLE ((const_gimple) s2));
- }
- 
- /* Helper function for the hashtab.  */
- 
- static hashval_t
- ref_base_address_1 (const void *s)
- {
-   gimple stmt = CONST_CAST_GIMPLE ((const_gimple) s);
-   unsigned i;
-   VEC (data_ref_loc, heap) *refs;
-   data_ref_loc *ref;
-   hashval_t res = 0;
- 
-   get_references_in_stmt (stmt, &refs);
- 
-   FOR_EACH_VEC_ELT (data_ref_loc, refs, i, ref)
-     if (!ref->is_read)
-       {
-       res = htab_hash_pointer (ref_base_address (stmt, ref));
-       break;
-       }
- 
-   VEC_free (data_ref_loc, heap, refs);
-   return res;
- }
- 
- /* Try to remove duplicated write data references from STMTS.  */
- 
- void
- remove_similar_memory_refs (VEC (gimple, heap) **stmts)
- {
-   unsigned i;
-   gimple stmt;
-   htab_t seen = htab_create (VEC_length (gimple, *stmts), ref_base_address_1,
-                            have_similar_memory_accesses_1, NULL);
- 
-   for (i = 0; VEC_iterate (gimple, *stmts, i, stmt); )
-     {
-       void **slot;
- 
-       slot = htab_find_slot (seen, stmt, INSERT);
- 
-       if (*slot)
-       VEC_ordered_remove (gimple, *stmts, i);
-       else
-       {
-         *slot = (void *) stmt;
-         i++;
-       }
-     }
- 
-   htab_delete (seen);
- }
- 
--- 5403,5405 ----
Index: gcc/tree-data-ref.h
===================================================================
*** gcc/tree-data-ref.h.orig    2012-06-01 14:21:36.000000000 +0200
--- gcc/tree-data-ref.h 2012-06-01 15:26:42.826591786 +0200
*************** index_in_loop_nest (int var, VEC (loop_p
*** 607,613 ****
  
  void stores_from_loop (struct loop *, VEC (gimple, heap) **);
  void stores_zero_from_loop (struct loop *, VEC (gimple, heap) **);
- void remove_similar_memory_refs (VEC (gimple, heap) **);
  bool rdg_defs_used_in_other_loops_p (struct graph *, int);
  bool have_similar_memory_accesses (gimple, gimple);
  bool stmt_with_adjacent_zero_store_dr_p (gimple);
--- 607,612 ----
Index: gcc/tree-loop-distribution.c
===================================================================
*** gcc/tree-loop-distribution.c.orig   2012-06-01 14:21:36.000000000 +0200
--- gcc/tree-loop-distribution.c        2012-06-01 15:45:22.848553012 +0200
*************** destroy_loop (struct loop *loop)
*** 398,404 ****
    rescan_loop_exit (exit, false, true);
  
    for (i = 0; i < nbbs; i++)
!     delete_basic_block (bbs[i]);
    free (bbs);
  
    set_immediate_dominator (CDI_DOMINATORS, dest,
--- 398,425 ----
    rescan_loop_exit (exit, false, true);
  
    for (i = 0; i < nbbs; i++)
!     {
!       /* We have made sure to not leave any dangling uses of SSA
!          names defined in the loop.  With the exception of virtuals.
!        Make sure we replace all uses of virtual defs that will remain
!        outside of the loop with the bare symbol as delete_basic_block
!        will release them.  */
!       gimple_stmt_iterator gsi;
!       for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
!       {
!         gimple phi = gsi_stmt (gsi);
!         if (!is_gimple_reg (gimple_phi_result (phi)))
!           mark_virtual_phi_result_for_renaming (phi);
!       }
!       for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
!       {
!         gimple stmt = gsi_stmt (gsi);
!         tree vdef = gimple_vdef (stmt);
!         if (vdef && TREE_CODE (vdef) == SSA_NAME)
!           mark_virtual_operand_for_renaming (vdef);
!       }
!       delete_basic_block (bbs[i]);
!     }
    free (bbs);
  
    set_immediate_dominator (CDI_DOMINATORS, dest,
*************** classify_partition (loop_p loop, struct
*** 801,806 ****
--- 822,830 ----
    partition->kind = PKIND_NORMAL;
    partition->main_stmt = NULL;
  
+   if (!flag_tree_loop_distribute_patterns)
+     return;
+ 
    /* Perform general partition disqualification for builtins.  */
    nb_iter = number_of_exit_cond_executions (loop);
    if (!nb_iter || nb_iter == chrec_dont_know)
*************** similar_memory_accesses (struct graph *r
*** 876,906 ****
    return false;
  }
  
- /* Fuse all the partitions from PARTITIONS that contain similar memory
-    references, i.e., we're taking care of cache locality.  This
-    function does not fuse those partitions that contain patterns that
-    can be code generated with builtins.  */
- 
- static void
- fuse_partitions_with_similar_memory_accesses (struct graph *rdg,
-                                             VEC (partition_t, heap) 
**partitions)
- {
-   int p1, p2;
-   partition_t partition1, partition2;
- 
-   FOR_EACH_VEC_ELT (partition_t, *partitions, p1, partition1)
-     if (!partition_builtin_p (partition1))
-       FOR_EACH_VEC_ELT (partition_t, *partitions, p2, partition2)
-       if (p1 != p2
-           && !partition_builtin_p (partition2)
-           && similar_memory_accesses (rdg, partition1, partition2))
-         {
-           bitmap_ior_into (partition1->stmts, partition2->stmts);
-           VEC_ordered_remove (partition_t, *partitions, p2);
-           p2--;
-         }
- }
- 
  /* Aggregate several components into a useful partition that is
     registered in the PARTITIONS vector.  Partitions will be
     distributed in different loops.  */
--- 900,905 ----
*************** ldist_gen (struct loop *loop, struct gra
*** 1100,1106 ****
    FOR_EACH_VEC_ELT (partition_t, partitions, i, partition)
      classify_partition (loop, rdg, partition);
  
!   fuse_partitions_with_similar_memory_accesses (rdg, &partitions);
  
    nbp = VEC_length (partition_t, partitions);
    if (nbp == 0
--- 1099,1153 ----
    FOR_EACH_VEC_ELT (partition_t, partitions, i, partition)
      classify_partition (loop, rdg, partition);
  
!   /* If we are only distributing patterns fuse all partitions that
!      were not properly classified as builtins.  Else fuse partitions
!      with similar memory accesses.  */
!   if (!flag_tree_loop_distribution)
!     {
!       partition_t into;
!       for (i = 0; VEC_iterate (partition_t, partitions, i, into); ++i)
!       if (!partition_builtin_p (into))
!         break;
!       for (++i; VEC_iterate (partition_t, partitions, i, partition); ++i)
!       if (!partition_builtin_p (partition))
!         {
!           bitmap_ior_into (into->stmts, partition->stmts);
!           VEC_ordered_remove (partition_t, partitions, i);
!           i--;
!         }
!     }
!   else
!     {
!       partition_t into;
!       int j;
!       for (i = 0; VEC_iterate (partition_t, partitions, i, into); ++i)
!       {
!         if (partition_builtin_p (into))
!           continue;
!         for (j = i + 1;
!              VEC_iterate (partition_t, partitions, j, partition); ++j)
!           {
!             if (!partition_builtin_p (partition)
!                 /* ???  The following is horribly inefficient,
!                    we are re-computing and analyzing data-references
!                    of the stmts in the partitions all the time.  */
!                 && similar_memory_accesses (rdg, into, partition))
!               {
!                 if (dump_file && (dump_flags & TDF_DETAILS))
!                   {
!                     fprintf (dump_file, "fusing partitions\n");
!                     dump_bitmap (dump_file, into->stmts);
!                     dump_bitmap (dump_file, partition->stmts);
!                     fprintf (dump_file, "because they have similar "
!                              "memory accesses\n");
!                   }
!                 bitmap_ior_into (into->stmts, partition->stmts);
!                 VEC_ordered_remove (partition_t, partitions, j);
!                 j--;
!               }
!           }
!       }
!     }
  
    nbp = VEC_length (partition_t, partitions);
    if (nbp == 0
*************** ldist_gen (struct loop *loop, struct gra
*** 1108,1114 ****
          && !partition_builtin_p (VEC_index (partition_t, partitions, 0)))
        || (nbp > 1
          && partition_contains_all_rw (rdg, partitions)))
!     goto ldist_done;
  
    if (dump_file && (dump_flags & TDF_DETAILS))
      dump_rdg_partitions (dump_file, partitions);
--- 1155,1164 ----
          && !partition_builtin_p (VEC_index (partition_t, partitions, 0)))
        || (nbp > 1
          && partition_contains_all_rw (rdg, partitions)))
!     {
!       nbp = 0;
!       goto ldist_done;
!     }
  
    if (dump_file && (dump_flags & TDF_DETAILS))
      dump_rdg_partitions (dump_file, partitions);
*************** ldist_gen (struct loop *loop, struct gra
*** 1116,1125 ****
    FOR_EACH_VEC_ELT (partition_t, partitions, i, partition)
      generate_code_for_partition (loop, partition, i < nbp - 1);
  
-   rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
-   mark_sym_for_renaming (gimple_vop (cfun));
-   update_ssa (TODO_update_ssa_only_virtuals);
- 
   ldist_done:
  
    BITMAP_FREE (remaining_stmts);
--- 1166,1171 ----
*************** distribute_loop (struct loop *loop, VEC
*** 1152,1167 ****
    VEC (data_reference_p, heap) *datarefs;
    VEC (loop_p, heap) *loop_nest;
  
-   if (loop->num_nodes > 2)
-     {
-       if (dump_file && (dump_flags & TDF_DETAILS))
-       fprintf (dump_file,
-                "FIXME: Loop %d not distributed: it has more than two basic 
blocks.\n",
-                loop->num);
- 
-       return res;
-     }
- 
    datarefs = VEC_alloc (data_reference_p, heap, 10);
    dependence_relations = VEC_alloc (ddr_p, heap, 100);
    loop_nest = VEC_alloc (loop_p, heap, 3);
--- 1198,1203 ----
*************** tree_loop_distribution (void)
*** 1215,1262 ****
  {
    struct loop *loop;
    loop_iterator li;
!   int nb_generated_loops = 0;
  
!   FOR_EACH_LOOP (li, loop, 0)
      {
        VEC (gimple, heap) *work_list = NULL;
        int num = loop->num;
  
        /* If the loop doesn't have a single exit we will fail anyway,
         so do that early.  */
        if (!single_exit (loop))
        continue;
  
!       /* If both flag_tree_loop_distribute_patterns and
!        flag_tree_loop_distribution are set, then only
!        distribute_patterns is executed.  */
!       if (flag_tree_loop_distribute_patterns)
!       {
!         /* With the following working list, we're asking
!            distribute_loop to separate from the rest of the loop the
!            stores of the form "A[i] = 0".  */
!         stores_zero_from_loop (loop, &work_list);
! 
!         /* Do nothing if there are no patterns to be distributed.  */
!         if (VEC_length (gimple, work_list) > 0)
!           nb_generated_loops = distribute_loop (loop, work_list);
!       }
!       else if (flag_tree_loop_distribution)
!       {
!         /* With the following working list, we're asking
!            distribute_loop to separate the stores of the loop: when
!            dependences allow, it will end on having one store per
!            loop.  */
!         stores_from_loop (loop, &work_list);
! 
!         /* A simple heuristic for cache locality is to not split
!            stores to the same array.  Without this call, an unrolled
!            loop would be split into as many loops as unroll factor,
!            each loop storing in the same array.  */
!         remove_similar_memory_refs (&work_list);
  
!         nb_generated_loops = distribute_loop (loop, work_list);
!       }
  
        if (dump_file && (dump_flags & TDF_DETAILS))
        {
--- 1251,1288 ----
  {
    struct loop *loop;
    loop_iterator li;
!   bool changed = false;
  
!   /* We can at the moment only distribute non-nested loops, thus restrict
!      walking to innermost loops.  */
!   FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
      {
        VEC (gimple, heap) *work_list = NULL;
        int num = loop->num;
+       int nb_generated_loops = 0;
  
        /* If the loop doesn't have a single exit we will fail anyway,
         so do that early.  */
        if (!single_exit (loop))
        continue;
  
!       /* Only distribute loops with a header and latch for now.  */
!       if (loop->num_nodes > 2)
!       continue;
  
!       /* -ftree-loop-distribution strictly distributes more but also
!          enables pattern detection.  For now simply distribute all stores
!        or memset like stores.  */
!       if (flag_tree_loop_distribution)
!       stores_from_loop (loop, &work_list);
!       else if (flag_tree_loop_distribute_patterns)
!       stores_zero_from_loop (loop, &work_list);
! 
!       if (VEC_length (gimple, work_list) > 0)
!       nb_generated_loops = distribute_loop (loop, work_list);
! 
!       if (nb_generated_loops > 0)
!       changed = true;
  
        if (dump_file && (dump_flags & TDF_DETAILS))
        {
*************** tree_loop_distribution (void)
*** 1267,1279 ****
            fprintf (dump_file, "Loop %d is the same.\n", num);
        }
  
- #ifdef ENABLE_CHECKING
-       verify_loop_structure ();
- #endif
- 
        VEC_free (gimple, heap, work_list);
      }
  
    return 0;
  }
  
--- 1293,1311 ----
            fprintf (dump_file, "Loop %d is the same.\n", num);
        }
  
        VEC_free (gimple, heap, work_list);
      }
  
+   if (changed)
+     {
+       mark_sym_for_renaming (gimple_vop (cfun));
+       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
+     }
+ 
+ #ifdef ENABLE_CHECKING
+   verify_loop_structure ();
+ #endif
+ 
    return 0;
  }
  
Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c
===================================================================
*** gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c.orig       2012-06-01 
14:21:36.000000000 +0200
--- gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c    2012-06-01 14:21:56.413726345 
+0200
***************
*** 1,5 ****
  /* { dg-do compile } */ 
! /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
  
  void foo (int * __restrict__ ia,
          int * __restrict__ ib,
--- 1,5 ----
  /* { dg-do compile } */ 
! /* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns 
-fdump-tree-ldist-all" } */
  
  void foo (int * __restrict__ ia,
          int * __restrict__ ib,
Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c
===================================================================
*** gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c.orig       2012-06-01 
14:21:36.000000000 +0200
--- gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c    2012-06-01 14:21:56.413726345 
+0200
***************
*** 1,5 ****
  /* { dg-do compile } */
! /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
  
  typedef int mad_fixed_t;
  struct mad_pcm
--- 1,5 ----
  /* { dg-do compile } */
! /* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns 
-fdump-tree-ldist-details" } */
  
  typedef int mad_fixed_t;
  struct mad_pcm
Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
===================================================================
*** gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c.orig  2012-06-01 
14:21:36.000000000 +0200
--- gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c       2012-06-01 
14:21:56.413726345 +0200
***************
*** 1,5 ****
  /* { dg-do compile } */
! /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
  
  extern void bar(int);
  
--- 1,5 ----
  /* { dg-do compile } */
! /* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns 
-fdump-tree-ldist-details" } */
  
  extern void bar(int);
  

Reply via email to