I noticed you cannot enable regular loop distribution at -O3 unless you disable pattern recognition. This lead me to simplifying the code, changing semantics of the flags slightly (but still compatible with what I read into their documentation). -floop-distribution enables loop distribution (but not pattern recognition) and -floop-distribute-patterns enables pattern recognition but not generic loop distribution. The latter is still enabled by -O3, if you enable both you get both.
The patch also removes one kind of "similar memory access" detection. I have a followup that makes the remaining one cheaper. Bootstrapped and tested on x86_64-unknown-linux-gnu, applied. Richard. 2012-06-04 Richard Guenther <rguent...@suse.de> * tree-data-ref.c (have_similar_memory_accesses_1): Remove. (ref_base_address_1): Likewise. (remove_similar_memory_refs): Likewise. * tree-data-ref.h (remove_similar_memory_refs): Remove. * tree-loop-distribution.c (classify_partition): Do not classify as builtin if -ftree-loop-distribute-patterns is not enabled. (fuse_partitions_with_similar_memory_accesses): Inline ... (ldist_gen): ... here. Fuse all non-builtin partitions if -ftree-loop-distribution is not enabled. Properly return the number of created partitions. Do not update SSA form here but ... (tree_loop_distribution): ... once here for the whole function. Only walk innermost loops, constrain loops we consider here further. Do not call remove_similar_memory_refs. (distribute_loop): Do not check number of loop nodes here. * gcc.dg/tree-ssa/ldist-11.c: Enable -ftree-loop-distribute-patterns. * gcc.dg/tree-ssa/ldist-17.c: Likewise. * gcc.dg/tree-ssa/ldist-pr45948.c: Likewise. Index: gcc/tree-data-ref.c =================================================================== *** gcc/tree-data-ref.c.orig 2012-06-01 14:21:36.000000000 +0200 --- gcc/tree-data-ref.c 2012-06-01 15:26:42.826591786 +0200 *************** have_similar_memory_accesses (gimple s1, *** 5403,5467 **** VEC_free (data_ref_loc, heap, refs2); return res; } - - /* Helper function for the hashtab. */ - - static int - have_similar_memory_accesses_1 (const void *s1, const void *s2) - { - return have_similar_memory_accesses (CONST_CAST_GIMPLE ((const_gimple) s1), - CONST_CAST_GIMPLE ((const_gimple) s2)); - } - - /* Helper function for the hashtab. */ - - static hashval_t - ref_base_address_1 (const void *s) - { - gimple stmt = CONST_CAST_GIMPLE ((const_gimple) s); - unsigned i; - VEC (data_ref_loc, heap) *refs; - data_ref_loc *ref; - hashval_t res = 0; - - get_references_in_stmt (stmt, &refs); - - FOR_EACH_VEC_ELT (data_ref_loc, refs, i, ref) - if (!ref->is_read) - { - res = htab_hash_pointer (ref_base_address (stmt, ref)); - break; - } - - VEC_free (data_ref_loc, heap, refs); - return res; - } - - /* Try to remove duplicated write data references from STMTS. */ - - void - remove_similar_memory_refs (VEC (gimple, heap) **stmts) - { - unsigned i; - gimple stmt; - htab_t seen = htab_create (VEC_length (gimple, *stmts), ref_base_address_1, - have_similar_memory_accesses_1, NULL); - - for (i = 0; VEC_iterate (gimple, *stmts, i, stmt); ) - { - void **slot; - - slot = htab_find_slot (seen, stmt, INSERT); - - if (*slot) - VEC_ordered_remove (gimple, *stmts, i); - else - { - *slot = (void *) stmt; - i++; - } - } - - htab_delete (seen); - } - --- 5403,5405 ---- Index: gcc/tree-data-ref.h =================================================================== *** gcc/tree-data-ref.h.orig 2012-06-01 14:21:36.000000000 +0200 --- gcc/tree-data-ref.h 2012-06-01 15:26:42.826591786 +0200 *************** index_in_loop_nest (int var, VEC (loop_p *** 607,613 **** void stores_from_loop (struct loop *, VEC (gimple, heap) **); void stores_zero_from_loop (struct loop *, VEC (gimple, heap) **); - void remove_similar_memory_refs (VEC (gimple, heap) **); bool rdg_defs_used_in_other_loops_p (struct graph *, int); bool have_similar_memory_accesses (gimple, gimple); bool stmt_with_adjacent_zero_store_dr_p (gimple); --- 607,612 ---- Index: gcc/tree-loop-distribution.c =================================================================== *** gcc/tree-loop-distribution.c.orig 2012-06-01 14:21:36.000000000 +0200 --- gcc/tree-loop-distribution.c 2012-06-01 15:45:22.848553012 +0200 *************** destroy_loop (struct loop *loop) *** 398,404 **** rescan_loop_exit (exit, false, true); for (i = 0; i < nbbs; i++) ! delete_basic_block (bbs[i]); free (bbs); set_immediate_dominator (CDI_DOMINATORS, dest, --- 398,425 ---- rescan_loop_exit (exit, false, true); for (i = 0; i < nbbs; i++) ! { ! /* We have made sure to not leave any dangling uses of SSA ! names defined in the loop. With the exception of virtuals. ! Make sure we replace all uses of virtual defs that will remain ! outside of the loop with the bare symbol as delete_basic_block ! will release them. */ ! gimple_stmt_iterator gsi; ! for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi)) ! { ! gimple phi = gsi_stmt (gsi); ! if (!is_gimple_reg (gimple_phi_result (phi))) ! mark_virtual_phi_result_for_renaming (phi); ! } ! for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi)) ! { ! gimple stmt = gsi_stmt (gsi); ! tree vdef = gimple_vdef (stmt); ! if (vdef && TREE_CODE (vdef) == SSA_NAME) ! mark_virtual_operand_for_renaming (vdef); ! } ! delete_basic_block (bbs[i]); ! } free (bbs); set_immediate_dominator (CDI_DOMINATORS, dest, *************** classify_partition (loop_p loop, struct *** 801,806 **** --- 822,830 ---- partition->kind = PKIND_NORMAL; partition->main_stmt = NULL; + if (!flag_tree_loop_distribute_patterns) + return; + /* Perform general partition disqualification for builtins. */ nb_iter = number_of_exit_cond_executions (loop); if (!nb_iter || nb_iter == chrec_dont_know) *************** similar_memory_accesses (struct graph *r *** 876,906 **** return false; } - /* Fuse all the partitions from PARTITIONS that contain similar memory - references, i.e., we're taking care of cache locality. This - function does not fuse those partitions that contain patterns that - can be code generated with builtins. */ - - static void - fuse_partitions_with_similar_memory_accesses (struct graph *rdg, - VEC (partition_t, heap) **partitions) - { - int p1, p2; - partition_t partition1, partition2; - - FOR_EACH_VEC_ELT (partition_t, *partitions, p1, partition1) - if (!partition_builtin_p (partition1)) - FOR_EACH_VEC_ELT (partition_t, *partitions, p2, partition2) - if (p1 != p2 - && !partition_builtin_p (partition2) - && similar_memory_accesses (rdg, partition1, partition2)) - { - bitmap_ior_into (partition1->stmts, partition2->stmts); - VEC_ordered_remove (partition_t, *partitions, p2); - p2--; - } - } - /* Aggregate several components into a useful partition that is registered in the PARTITIONS vector. Partitions will be distributed in different loops. */ --- 900,905 ---- *************** ldist_gen (struct loop *loop, struct gra *** 1100,1106 **** FOR_EACH_VEC_ELT (partition_t, partitions, i, partition) classify_partition (loop, rdg, partition); ! fuse_partitions_with_similar_memory_accesses (rdg, &partitions); nbp = VEC_length (partition_t, partitions); if (nbp == 0 --- 1099,1153 ---- FOR_EACH_VEC_ELT (partition_t, partitions, i, partition) classify_partition (loop, rdg, partition); ! /* If we are only distributing patterns fuse all partitions that ! were not properly classified as builtins. Else fuse partitions ! with similar memory accesses. */ ! if (!flag_tree_loop_distribution) ! { ! partition_t into; ! for (i = 0; VEC_iterate (partition_t, partitions, i, into); ++i) ! if (!partition_builtin_p (into)) ! break; ! for (++i; VEC_iterate (partition_t, partitions, i, partition); ++i) ! if (!partition_builtin_p (partition)) ! { ! bitmap_ior_into (into->stmts, partition->stmts); ! VEC_ordered_remove (partition_t, partitions, i); ! i--; ! } ! } ! else ! { ! partition_t into; ! int j; ! for (i = 0; VEC_iterate (partition_t, partitions, i, into); ++i) ! { ! if (partition_builtin_p (into)) ! continue; ! for (j = i + 1; ! VEC_iterate (partition_t, partitions, j, partition); ++j) ! { ! if (!partition_builtin_p (partition) ! /* ??? The following is horribly inefficient, ! we are re-computing and analyzing data-references ! of the stmts in the partitions all the time. */ ! && similar_memory_accesses (rdg, into, partition)) ! { ! if (dump_file && (dump_flags & TDF_DETAILS)) ! { ! fprintf (dump_file, "fusing partitions\n"); ! dump_bitmap (dump_file, into->stmts); ! dump_bitmap (dump_file, partition->stmts); ! fprintf (dump_file, "because they have similar " ! "memory accesses\n"); ! } ! bitmap_ior_into (into->stmts, partition->stmts); ! VEC_ordered_remove (partition_t, partitions, j); ! j--; ! } ! } ! } ! } nbp = VEC_length (partition_t, partitions); if (nbp == 0 *************** ldist_gen (struct loop *loop, struct gra *** 1108,1114 **** && !partition_builtin_p (VEC_index (partition_t, partitions, 0))) || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) ! goto ldist_done; if (dump_file && (dump_flags & TDF_DETAILS)) dump_rdg_partitions (dump_file, partitions); --- 1155,1164 ---- && !partition_builtin_p (VEC_index (partition_t, partitions, 0))) || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) ! { ! nbp = 0; ! goto ldist_done; ! } if (dump_file && (dump_flags & TDF_DETAILS)) dump_rdg_partitions (dump_file, partitions); *************** ldist_gen (struct loop *loop, struct gra *** 1116,1125 **** FOR_EACH_VEC_ELT (partition_t, partitions, i, partition) generate_code_for_partition (loop, partition, i < nbp - 1); - rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); - mark_sym_for_renaming (gimple_vop (cfun)); - update_ssa (TODO_update_ssa_only_virtuals); - ldist_done: BITMAP_FREE (remaining_stmts); --- 1166,1171 ---- *************** distribute_loop (struct loop *loop, VEC *** 1152,1167 **** VEC (data_reference_p, heap) *datarefs; VEC (loop_p, heap) *loop_nest; - if (loop->num_nodes > 2) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, - "FIXME: Loop %d not distributed: it has more than two basic blocks.\n", - loop->num); - - return res; - } - datarefs = VEC_alloc (data_reference_p, heap, 10); dependence_relations = VEC_alloc (ddr_p, heap, 100); loop_nest = VEC_alloc (loop_p, heap, 3); --- 1198,1203 ---- *************** tree_loop_distribution (void) *** 1215,1262 **** { struct loop *loop; loop_iterator li; ! int nb_generated_loops = 0; ! FOR_EACH_LOOP (li, loop, 0) { VEC (gimple, heap) *work_list = NULL; int num = loop->num; /* If the loop doesn't have a single exit we will fail anyway, so do that early. */ if (!single_exit (loop)) continue; ! /* If both flag_tree_loop_distribute_patterns and ! flag_tree_loop_distribution are set, then only ! distribute_patterns is executed. */ ! if (flag_tree_loop_distribute_patterns) ! { ! /* With the following working list, we're asking ! distribute_loop to separate from the rest of the loop the ! stores of the form "A[i] = 0". */ ! stores_zero_from_loop (loop, &work_list); ! ! /* Do nothing if there are no patterns to be distributed. */ ! if (VEC_length (gimple, work_list) > 0) ! nb_generated_loops = distribute_loop (loop, work_list); ! } ! else if (flag_tree_loop_distribution) ! { ! /* With the following working list, we're asking ! distribute_loop to separate the stores of the loop: when ! dependences allow, it will end on having one store per ! loop. */ ! stores_from_loop (loop, &work_list); ! ! /* A simple heuristic for cache locality is to not split ! stores to the same array. Without this call, an unrolled ! loop would be split into as many loops as unroll factor, ! each loop storing in the same array. */ ! remove_similar_memory_refs (&work_list); ! nb_generated_loops = distribute_loop (loop, work_list); ! } if (dump_file && (dump_flags & TDF_DETAILS)) { --- 1251,1288 ---- { struct loop *loop; loop_iterator li; ! bool changed = false; ! /* We can at the moment only distribute non-nested loops, thus restrict ! walking to innermost loops. */ ! FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST) { VEC (gimple, heap) *work_list = NULL; int num = loop->num; + int nb_generated_loops = 0; /* If the loop doesn't have a single exit we will fail anyway, so do that early. */ if (!single_exit (loop)) continue; ! /* Only distribute loops with a header and latch for now. */ ! if (loop->num_nodes > 2) ! continue; ! /* -ftree-loop-distribution strictly distributes more but also ! enables pattern detection. For now simply distribute all stores ! or memset like stores. */ ! if (flag_tree_loop_distribution) ! stores_from_loop (loop, &work_list); ! else if (flag_tree_loop_distribute_patterns) ! stores_zero_from_loop (loop, &work_list); ! ! if (VEC_length (gimple, work_list) > 0) ! nb_generated_loops = distribute_loop (loop, work_list); ! ! if (nb_generated_loops > 0) ! changed = true; if (dump_file && (dump_flags & TDF_DETAILS)) { *************** tree_loop_distribution (void) *** 1267,1279 **** fprintf (dump_file, "Loop %d is the same.\n", num); } - #ifdef ENABLE_CHECKING - verify_loop_structure (); - #endif - VEC_free (gimple, heap, work_list); } return 0; } --- 1293,1311 ---- fprintf (dump_file, "Loop %d is the same.\n", num); } VEC_free (gimple, heap, work_list); } + if (changed) + { + mark_sym_for_renaming (gimple_vop (cfun)); + rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); + } + + #ifdef ENABLE_CHECKING + verify_loop_structure (); + #endif + return 0; } Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c =================================================================== *** gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c.orig 2012-06-01 14:21:36.000000000 +0200 --- gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c 2012-06-01 14:21:56.413726345 +0200 *************** *** 1,5 **** /* { dg-do compile } */ ! /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ void foo (int * __restrict__ ia, int * __restrict__ ib, --- 1,5 ---- /* { dg-do compile } */ ! /* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-all" } */ void foo (int * __restrict__ ia, int * __restrict__ ib, Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c =================================================================== *** gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c.orig 2012-06-01 14:21:36.000000000 +0200 --- gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c 2012-06-01 14:21:56.413726345 +0200 *************** *** 1,5 **** /* { dg-do compile } */ ! /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ typedef int mad_fixed_t; struct mad_pcm --- 1,5 ---- /* { dg-do compile } */ ! /* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */ typedef int mad_fixed_t; struct mad_pcm Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c =================================================================== *** gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c.orig 2012-06-01 14:21:36.000000000 +0200 --- gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c 2012-06-01 14:21:56.413726345 +0200 *************** *** 1,5 **** /* { dg-do compile } */ ! /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ extern void bar(int); --- 1,5 ---- /* { dg-do compile } */ ! /* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */ extern void bar(int);