I split out some TLC to loop distribution from a patch I'll post shortly. Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
Richard. 2013-10-02 Richard Biener <rguent...@suse.de> * tree-loop-distribution.c: Include tree-vectorizer.h for find_loop_location. (enum partition_kind): Remove PKIND_REDUCTION. (struct partition_s): Remove has_writes member, add reduction_p member. (partition_alloc): Adjust. (partition_builtin_p): Likewise. (partition_has_writes): Remove. (partition_reduction_p): New function. (partition_merge_into): Likewise. (generate_code_for_partition): Commonize builtin partition handling tail. (rdg_cannot_recompute_vertex_p): Remove. (already_processed_vertex_p): Likewise. (rdg_flag_vertex): Do not set has_writes. (classify_partition): Adjust. (rdg_build_partitions): Do not set has_writes, treat all partitions as useful. (distribute_loop): Record number of library calls generated. Adjust. (tree_loop_distribution): Report number of loops and library calls generated as opt-info. * gcc.dg/tree-ssa/ldist-11.c: Adjust. * gcc.dg/tree-ssa/ldist-17.c: Likewise. * gcc.dg/tree-ssa/ldist-23.c: Likewise. * gcc.dg/tree-ssa/ldist-pr45948.c: Likewise. * gfortran.dg/ldist-pr45199.f: Likewise. Index: gcc/tree-loop-distribution.c =================================================================== *** gcc/tree-loop-distribution.c.orig 2013-10-02 14:42:48.000000000 +0200 --- gcc/tree-loop-distribution.c 2013-10-02 14:49:05.775900940 +0200 *************** along with GCC; see the file COPYING3. *** 51,56 **** --- 51,57 ---- #include "tree-scalar-evolution.h" #include "tree-pass.h" #include "gimple-pretty-print.h" + #include "tree-vectorizer.h" /* A Reduced Dependence Graph (RDG) vertex representing a statement. */ *************** build_rdg (vec<loop_p> loop_nest, contro *** 557,570 **** enum partition_kind { ! PKIND_NORMAL, PKIND_REDUCTION, PKIND_MEMSET, PKIND_MEMCPY }; typedef struct partition_s { bitmap stmts; bitmap loops; ! bool has_writes; enum partition_kind kind; /* data-references a kind != PKIND_NORMAL partition is about. */ data_reference_p main_dr; --- 558,571 ---- enum partition_kind { ! PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY }; typedef struct partition_s { bitmap stmts; bitmap loops; ! bool reduction_p; enum partition_kind kind; /* data-references a kind != PKIND_NORMAL partition is about. */ data_reference_p main_dr; *************** partition_alloc (bitmap stmts, bitmap lo *** 581,587 **** partition_t partition = XCNEW (struct partition_s); partition->stmts = stmts ? stmts : BITMAP_ALLOC (NULL); partition->loops = loops ? loops : BITMAP_ALLOC (NULL); ! partition->has_writes = false; partition->kind = PKIND_NORMAL; return partition; } --- 582,588 ---- partition_t partition = XCNEW (struct partition_s); partition->stmts = stmts ? stmts : BITMAP_ALLOC (NULL); partition->loops = loops ? loops : BITMAP_ALLOC (NULL); ! partition->reduction_p = false; partition->kind = PKIND_NORMAL; return partition; } *************** partition_free (partition_t partition) *** 601,617 **** static bool partition_builtin_p (partition_t partition) { ! return partition->kind > PKIND_REDUCTION; } ! /* Returns true if the partition has an writes. */ static bool ! partition_has_writes (partition_t partition) { ! return partition->has_writes; } /* Returns true when DEF is an SSA_NAME defined in LOOP and used after the LOOP. */ --- 602,630 ---- static bool partition_builtin_p (partition_t partition) { ! return partition->kind != PKIND_NORMAL; } ! /* Returns true if the partition contains a reduction. */ static bool ! partition_reduction_p (partition_t partition) { ! return partition->reduction_p; } + /* Merge PARTITION into the partition DEST. */ + + static void + partition_merge_into (partition_t dest, partition_t partition) + { + dest->kind = PKIND_NORMAL; + bitmap_ior_into (dest->stmts, partition->stmts); + if (partition_reduction_p (partition)) + dest->reduction_p = true; + } + + /* Returns true when DEF is an SSA_NAME defined in LOOP and used after the LOOP. */ *************** generate_code_for_partition (struct loop *** 998,1055 **** { switch (partition->kind) { case PKIND_MEMSET: generate_memset_builtin (loop, partition); - /* If this is the last partition for which we generate code, we have - to destroy the loop. */ - if (!copy_p) - destroy_loop (loop); break; case PKIND_MEMCPY: generate_memcpy_builtin (loop, partition); - /* If this is the last partition for which we generate code, we have - to destroy the loop. */ - if (!copy_p) - destroy_loop (loop); - break; - - case PKIND_REDUCTION: - /* Reductions all have to be in the last partition. */ - gcc_assert (!copy_p); - case PKIND_NORMAL: - generate_loops_for_partition (loop, partition, copy_p); break; default: gcc_unreachable (); } - } - - - /* Returns true if the node V of RDG cannot be recomputed. */ - - static bool - rdg_cannot_recompute_vertex_p (struct graph *rdg, int v) - { - if (RDG_MEM_WRITE_STMT (rdg, v)) - return true; ! return false; ! } ! ! /* Returns true when the vertex V has already been generated in the ! current partition (V is in PROCESSED), or when V belongs to another ! partition and cannot be recomputed (V is not in REMAINING_STMTS). */ ! ! static inline bool ! already_processed_vertex_p (bitmap processed, int v) ! { ! return bitmap_bit_p (processed, v); } - static void rdg_flag_vertex_and_dependent (struct graph *, int, partition_t, - bitmap); /* Flag V from RDG as part of PARTITION, and also flag its loop number in LOOPS. */ --- 1011,1041 ---- { switch (partition->kind) { + case PKIND_NORMAL: + /* Reductions all have to be in the last partition. */ + gcc_assert (!partition_reduction_p (partition) + || !copy_p); + generate_loops_for_partition (loop, partition, copy_p); + return; + case PKIND_MEMSET: generate_memset_builtin (loop, partition); break; case PKIND_MEMCPY: generate_memcpy_builtin (loop, partition); break; default: gcc_unreachable (); } ! /* Common tail for partitions we turn into a call. If this was the last ! partition for which we generate code, we have to destroy the loop. */ ! if (!copy_p) ! destroy_loop (loop); } /* Flag V from RDG as part of PARTITION, and also flag its loop number in LOOPS. */ *************** rdg_flag_vertex (struct graph *rdg, int *** 1064,1072 **** loop = loop_containing_stmt (RDG_STMT (rdg, v)); bitmap_set_bit (partition->loops, loop->num); - - if (rdg_cannot_recompute_vertex_p (rdg, v)) - partition->has_writes = true; } /* Flag in the bitmap PARTITION the vertex V and all its predecessors. --- 1050,1055 ---- *************** classify_partition (loop_p loop, struct *** 1127,1141 **** if (gimple_has_volatile_ops (stmt)) volatiles_p = true; ! /* If the stmt has uses outside of the loop fail. ! ??? If the stmt is generated in another partition that ! is not created as builtin we can ignore this. */ if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) { ! if (dump_file && (dump_flags & TDF_DETAILS)) ! fprintf (dump_file, "not generating builtin, partition has " ! "scalar uses outside of the loop\n"); ! partition->kind = PKIND_REDUCTION; return; } } --- 1110,1119 ---- if (gimple_has_volatile_ops (stmt)) volatiles_p = true; ! /* If the stmt has uses outside of the loop mark it as reduction. */ if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) { ! partition->reduction_p = true; return; } } *************** rdg_build_partitions (struct graph *rdg, *** 1356,1376 **** np = build_rdg_partition_for_vertex (rdg, v); bitmap_ior_into (partition->stmts, np->stmts); - partition->has_writes = partition_has_writes (np); bitmap_ior_into (processed, np->stmts); partition_free (np); ! if (partition_has_writes (partition)) { ! if (dump_file && (dump_flags & TDF_DETAILS)) ! { ! fprintf (dump_file, "ldist useful partition:\n"); ! dump_bitmap (dump_file, partition->stmts); ! } ! ! partitions->safe_push (partition); ! partition = partition_alloc (NULL, NULL); } } /* All vertices should have been assigned to at least one partition now, --- 1334,1350 ---- np = build_rdg_partition_for_vertex (rdg, v); bitmap_ior_into (partition->stmts, np->stmts); bitmap_ior_into (processed, np->stmts); partition_free (np); ! if (dump_file && (dump_flags & TDF_DETAILS)) { ! fprintf (dump_file, "ldist useful partition:\n"); ! dump_bitmap (dump_file, partition->stmts); } + + partitions->safe_push (partition); + partition = partition_alloc (NULL, NULL); } /* All vertices should have been assigned to at least one partition now, *************** partition_contains_all_rw (struct graph *** 1480,1486 **** static int distribute_loop (struct loop *loop, vec<gimple> stmts, ! control_dependences *cd) { struct graph *rdg; vec<loop_p> loop_nest; --- 1454,1460 ---- static int distribute_loop (struct loop *loop, vec<gimple> stmts, ! control_dependences *cd, int *nb_calls) { struct graph *rdg; vec<loop_p> loop_nest; *************** distribute_loop (struct loop *loop, vec< *** 1489,1494 **** --- 1463,1469 ---- bool any_builtin; int i, nbp; + *nb_calls = 0; loop_nest.create (3); if (!find_loop_nest (loop, &loop_nest)) { *************** distribute_loop (struct loop *loop, vec< *** 1551,1559 **** fprintf (dump_file, "because they have similar " "memory accesses\n"); } ! bitmap_ior_into (into->stmts, partition->stmts); ! if (partition->kind == PKIND_REDUCTION) ! into->kind = PKIND_REDUCTION; partitions.ordered_remove (j); partition_free (partition); j--; --- 1526,1532 ---- fprintf (dump_file, "because they have similar " "memory accesses\n"); } ! partition_merge_into (into, partition); partitions.ordered_remove (j); partition_free (partition); j--; *************** distribute_loop (struct loop *loop, vec< *** 1577,1585 **** for (++i; partitions.iterate (i, &partition); ++i) if (!partition_builtin_p (partition)) { ! bitmap_ior_into (into->stmts, partition->stmts); ! if (partition->kind == PKIND_REDUCTION) ! into->kind = PKIND_REDUCTION; partitions.ordered_remove (i); partition_free (partition); i--; --- 1550,1556 ---- for (++i; partitions.iterate (i, &partition); ++i) if (!partition_builtin_p (partition)) { ! partition_merge_into (into, partition); partitions.ordered_remove (i); partition_free (partition); i--; *************** distribute_loop (struct loop *loop, vec< *** 1597,1603 **** for (i = partitions.length () - 2; i >= 0; --i) { partition_t what = partitions[i]; ! if (what->kind == PKIND_REDUCTION) { if (dump_file && (dump_flags & TDF_DETAILS)) { --- 1568,1574 ---- for (i = partitions.length () - 2; i >= 0; --i) { partition_t what = partitions[i]; ! if (partition_reduction_p (what)) { if (dump_file && (dump_flags & TDF_DETAILS)) { *************** distribute_loop (struct loop *loop, vec< *** 1606,1613 **** dump_bitmap (dump_file, what->stmts); fprintf (dump_file, "because the latter has reductions\n"); } ! bitmap_ior_into (into->stmts, what->stmts); ! into->kind = PKIND_REDUCTION; partitions.ordered_remove (i); partition_free (what); } --- 1577,1583 ---- dump_bitmap (dump_file, what->stmts); fprintf (dump_file, "because the latter has reductions\n"); } ! partition_merge_into (into, what); partitions.ordered_remove (i); partition_free (what); } *************** distribute_loop (struct loop *loop, vec< *** 1627,1633 **** dump_rdg_partitions (dump_file, partitions); FOR_EACH_VEC_ELT (partitions, i, partition) ! generate_code_for_partition (loop, partition, i < nbp - 1); ldist_done: --- 1597,1607 ---- dump_rdg_partitions (dump_file, partitions); FOR_EACH_VEC_ELT (partitions, i, partition) ! { ! if (partition_builtin_p (partition)) ! (*nb_calls)++; ! generate_code_for_partition (loop, partition, i < nbp - 1); ! } ldist_done: *************** distribute_loop (struct loop *loop, vec< *** 1637,1643 **** free_rdg (rdg); loop_nest.release (); ! return nbp; } /* Distribute all loops in the current function. */ --- 1611,1617 ---- free_rdg (rdg); loop_nest.release (); ! return nbp - *nb_calls; } /* Distribute all loops in the current function. */ *************** tree_loop_distribution (void) *** 1667,1673 **** vec<gimple> work_list = vNULL; basic_block *bbs; int num = loop->num; - int nb_generated_loops = 0; unsigned int i; /* If the loop doesn't have a single exit we will fail anyway, --- 1641,1646 ---- *************** tree_loop_distribution (void) *** 1722,1727 **** --- 1695,1703 ---- out: free (bbs); + int nb_generated_loops = 0; + int nb_generated_calls = 0; + location_t loc = find_loop_location (loop); if (work_list.length () > 0) { if (!cd) *************** out: *** 1731,1750 **** cd = new control_dependences (create_edge_list ()); free_dominance_info (CDI_POST_DOMINATORS); } ! nb_generated_loops = distribute_loop (loop, work_list, cd); } ! if (nb_generated_loops > 0) ! changed = true; ! ! if (dump_file && (dump_flags & TDF_DETAILS)) { ! if (nb_generated_loops > 1) ! fprintf (dump_file, "Loop %d distributed: split to %d loops.\n", ! num, nb_generated_loops); ! else ! fprintf (dump_file, "Loop %d is the same.\n", num); } work_list.release (); } --- 1707,1726 ---- cd = new control_dependences (create_edge_list ()); free_dominance_info (CDI_POST_DOMINATORS); } ! nb_generated_loops = distribute_loop (loop, work_list, cd, ! &nb_generated_calls); } ! if (nb_generated_loops + nb_generated_calls > 0) { ! changed = true; ! dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, ! loc, "Loop %d distributed: split to %d loops " ! "and %d library calls.\n", ! num, nb_generated_loops, nb_generated_calls); } + else if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Loop %d is the same.\n", num); work_list.release (); } Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c =================================================================== *** gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c.orig 2013-10-02 14:42:48.000000000 +0200 --- gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c 2013-10-02 14:42:56.874651935 +0200 *************** void foo (int * __restrict__ ia, *** 28,33 **** */ } ! /* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */ /* { dg-final { scan-tree-dump-times "generated memset zero" 1 "ldist" } } */ /* { dg-final { cleanup-tree-dump "ldist" } } */ --- 28,33 ---- */ } ! /* { dg-final { scan-tree-dump-times "distributed: split to 1 loops and 1 library calls" 1 "ldist" } } */ /* { dg-final { scan-tree-dump-times "generated memset zero" 1 "ldist" } } */ /* { dg-final { cleanup-tree-dump "ldist" } } */ Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c =================================================================== *** gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c.orig 2013-10-02 14:42:48.000000000 +0200 --- gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c 2013-10-02 14:42:56.874651935 +0200 *************** mad_synth_mute (struct mad_synth *synth) *** 45,50 **** return; } ! /* { dg-final { scan-tree-dump "distributed: split to 4" "ldist" } } */ /* { dg-final { scan-tree-dump-times "generated memset zero" 4 "ldist" } } */ /* { dg-final { cleanup-tree-dump "ldist" } } */ --- 45,50 ---- return; } ! /* { dg-final { scan-tree-dump "distributed: split to 0 loops and 4 library calls" "ldist" } } */ /* { dg-final { scan-tree-dump-times "generated memset zero" 4 "ldist" } } */ /* { dg-final { cleanup-tree-dump "ldist" } } */ Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-23.c =================================================================== *** gcc/testsuite/gcc.dg/tree-ssa/ldist-23.c.orig 2013-10-02 14:42:48.000000000 +0200 --- gcc/testsuite/gcc.dg/tree-ssa/ldist-23.c 2013-10-02 14:42:56.875651946 +0200 *************** int main() *** 29,34 **** return 0; } ! /* { dg-final { scan-tree-dump "split to 2 loops" "ldist" } } */ /* { dg-final { scan-tree-dump "generated memcpy" "ldist" } } */ /* { dg-final { cleanup-tree-dump "ldist" } } */ --- 29,34 ---- return 0; } ! /* { dg-final { scan-tree-dump "split to 1 loops and 1 library call" "ldist" } } */ /* { dg-final { scan-tree-dump "generated memcpy" "ldist" } } */ /* { dg-final { cleanup-tree-dump "ldist" } } */ Index: gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c =================================================================== *** gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c.orig 2013-10-02 14:42:48.000000000 +0200 --- gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c 2013-10-02 14:42:56.875651946 +0200 *************** foo (int i, int n) *** 18,23 **** /* We should apply loop distribution and generate 2 memset (0). */ ! /* { dg-final { scan-tree-dump "distributed: split to 2" "ldist" } } */ /* { dg-final { scan-tree-dump-times "generated memset zero" 2 "ldist" } } */ /* { dg-final { cleanup-tree-dump "ldist" } } */ --- 18,23 ---- /* We should apply loop distribution and generate 2 memset (0). */ ! /* { dg-final { scan-tree-dump "distributed: split to 0 loops and 2 library calls" "ldist" } } */ /* { dg-final { scan-tree-dump-times "generated memset zero" 2 "ldist" } } */ /* { dg-final { cleanup-tree-dump "ldist" } } */ Index: gcc/testsuite/gfortran.dg/ldist-pr45199.f =================================================================== *** gcc/testsuite/gfortran.dg/ldist-pr45199.f.orig 2012-01-30 16:44:51.000000000 +0100 --- gcc/testsuite/gfortran.dg/ldist-pr45199.f 2013-10-02 14:43:54.388314258 +0200 *************** *** 22,27 **** ! GCC should apply memset zero loop distribution and it should not ICE. ! ! { dg-final { scan-tree-dump "distributed: split to 9 loops" "ldist" } } ! { dg-final { scan-tree-dump-times "generated memset zero" 9 "ldist" } } ! { dg-final { cleanup-tree-dump "ldist" } } --- 22,27 ---- ! GCC should apply memset zero loop distribution and it should not ICE. ! ! { dg-final { scan-tree-dump "distributed: split to 0 loops and 9 library calls" "ldist" } } ! { dg-final { scan-tree-dump-times "generated memset zero" 9 "ldist" } } ! { dg-final { cleanup-tree-dump "ldist" } }