On 17 July 2015 at 06:32, Tobias Grosser <tob...@grosser.es> wrote: > On 07/17/2015 12:35 AM, Sebastian Pop wrote: >> >> gcc/ChangeLog: >> >> 2015-07-16 Aditya Kumar <aditya...@samsung.com> >> Sebastian Pop <s....@samsung.com> >> >> * common.opt (floop-fuse): New. >> * doc/invoke.texi (floop-fuse): Documented. >> * graphite-optimize-isl.c (optimize_isl): Use >> ISL_SCHEDULE_FUSE_MAX when using flag_loop_fuse. >> * graphite-poly.c (apply_poly_transforms): Call optimize_isl when >> using flag_loop_fuse. >> * graphite.c (gate_graphite_transforms): Enable graphite with >> flag_loop_fuse. > > > LGTM.
As far as I can see, this won't work with isl-0.15.0, where this spot changed; see https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01162.html and the follow-up messages. Apart from that, it looks identical in spirit to what I came up with when I started thinking about the fusion part of https://gcc.gnu.org/PR66741 ;) I haven't had time to pursue this yet due to real-life interference though, so if you beat me to it... Thanks, > > Tobias > >> gcc/testsuite/ChangeLog: >> >> 2015-07-16 Aditya Kumar <aditya...@samsung.com> >> Sebastian Pop <s....@samsung.com> >> >> >> * gcc.dg/graphite/fuse-1.c: New test. >> * gcc.dg/graphite/fuse-2.c: New test. >> --- >> gcc/common.opt | 4 ++++ >> gcc/doc/invoke.texi | 23 +++++++++++++++++++- >> gcc/graphite-optimize-isl.c | 5 ++++- >> gcc/graphite-poly.c | 2 +- >> gcc/graphite.c | 3 ++- >> gcc/testsuite/gcc.dg/graphite/fuse-1.c | 32 ++++++++++++++++++++++++++++ >> gcc/testsuite/gcc.dg/graphite/fuse-2.c | 38 >> ++++++++++++++++++++++++++++++++++ >> 7 files changed, 103 insertions(+), 4 deletions(-) >> create mode 100644 gcc/testsuite/gcc.dg/graphite/fuse-1.c >> create mode 100644 gcc/testsuite/gcc.dg/graphite/fuse-2.c >> >> diff --git a/gcc/common.opt b/gcc/common.opt >> index dd49ae3..200ecc1 100644 >> --- a/gcc/common.opt >> +++ b/gcc/common.opt >> @@ -1365,6 +1365,10 @@ floop-nest-optimize >> Common Report Var(flag_loop_optimize_isl) Optimization >> Enable the ISL based loop nest optimizer >> >> +floop-fuse >> +Common Report Var(flag_loop_fuse) Optimization >> +Enable loop fusion >> + >> fstrict-volatile-bitfields >> Common Report Var(flag_strict_volatile_bitfields) Init(-1) Optimization >> Force bitfield accesses to match their type width >> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi >> index b99ab1c..7cc8bb9 100644 >> --- a/gcc/doc/invoke.texi >> +++ b/gcc/doc/invoke.texi >> @@ -409,7 +409,7 @@ Objective-C and Objective-C++ Dialects}. 
>> -fivopts -fkeep-inline-functions -fkeep-static-consts @gol >> -flive-range-shrinkage @gol >> -floop-block -floop-interchange -floop-strip-mine @gol >> --floop-unroll-and-jam -floop-nest-optimize @gol >> +-floop-unroll-and-jam -floop-nest-optimize -floop-fuse @gol >> -floop-parallelize-all -flra-remat -flto -flto-compression-level @gol >> -flto-partition=@var{alg} -flto-report -flto-report-wpa >> -fmerge-all-constants @gol >> -fmerge-constants -fmodulo-sched -fmodulo-sched-allow-regmoves @gol >> @@ -8796,6 +8796,27 @@ optimizer based on the Pluto optimization >> algorithms. It calculates a loop >> structure optimized for data-locality and parallelism. This option >> is experimental. >> >> +@item -floop-fuse >> +@opindex floop-fuse >> +Enable loop fusion. This option is experimental. >> + >> +For example, given a loop like: >> +@smallexample >> +DO I = 1, N >> + A(I) = A(I) + B(I) >> +ENDDO >> +DO I = 1, N >> + A(I) = A(I) + C(I) >> +ENDDO >> +@end smallexample >> +@noindent >> +loop fusion transforms the loop as if it were written: >> +@smallexample >> +DO I = 1, N >> + A(I) = A(I) + B(I) + C(I) >> +ENDDO >> +@end smallexample >> + >> @item -floop-unroll-and-jam >> @opindex floop-unroll-and-jam >> Enable unroll and jam for the ISL based loop nest optimizer. 
The unroll >> diff --git a/gcc/graphite-optimize-isl.c b/gcc/graphite-optimize-isl.c >> index 624cc87..c016461 100644 >> --- a/gcc/graphite-optimize-isl.c >> +++ b/gcc/graphite-optimize-isl.c >> @@ -599,7 +599,10 @@ optimize_isl (scop_p scop) >> >> isl_options_set_schedule_max_constant_term (scop->ctx, >> CONSTANT_BOUND); >> isl_options_set_schedule_maximize_band_depth (scop->ctx, 1); >> - isl_options_set_schedule_fuse (scop->ctx, ISL_SCHEDULE_FUSE_MIN); >> + if (flag_loop_fuse) >> + isl_options_set_schedule_fuse (scop->ctx, ISL_SCHEDULE_FUSE_MAX); >> + else >> + isl_options_set_schedule_fuse (scop->ctx, ISL_SCHEDULE_FUSE_MIN); >> isl_options_set_on_error (scop->ctx, ISL_ON_ERROR_CONTINUE); >> >> #ifdef HAVE_ISL_SCHED_CONSTRAINTS_COMPUTE_SCHEDULE >> diff --git a/gcc/graphite-poly.c b/gcc/graphite-poly.c >> index 4407dc5..4808fbe 100644 >> --- a/gcc/graphite-poly.c >> +++ b/gcc/graphite-poly.c >> @@ -272,7 +272,7 @@ apply_poly_transforms (scop_p scop) >> >> /* This pass needs to be run at the final stage, as it does not >> update the lst. 
*/ >> - if (flag_loop_optimize_isl || flag_loop_unroll_jam) >> + if (flag_loop_optimize_isl || flag_loop_unroll_jam || flag_loop_fuse) >> transform_done |= optimize_isl (scop); >> >> return transform_done; >> diff --git a/gcc/graphite.c b/gcc/graphite.c >> index ba8029a..51af1a2a 100644 >> --- a/gcc/graphite.c >> +++ b/gcc/graphite.c >> @@ -342,7 +342,8 @@ gate_graphite_transforms (void) >> || flag_graphite_identity >> || flag_loop_parallelize_all >> || flag_loop_optimize_isl >> - || flag_loop_unroll_jam) >> + || flag_loop_unroll_jam >> + || flag_loop_fuse) >> flag_graphite = 1; >> >> return flag_graphite != 0; >> diff --git a/gcc/testsuite/gcc.dg/graphite/fuse-1.c >> b/gcc/testsuite/gcc.dg/graphite/fuse-1.c >> new file mode 100644 >> index 0000000..f368f47 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/graphite/fuse-1.c >> @@ -0,0 +1,32 @@ >> +/* Check that the two loops are fused and that we manage to fold the two >> xor >> + operations. */ >> +/* { dg-options "-O2 -floop-fuse -fdump-tree-forwprop-all" } */ >> +/* { dg-final { scan-tree-dump-times "gimple_simplified to\[^\\n\]*\\^ >> 12" 1 "forwprop4" } } */ >> +/* { dg-do run } */ >> + >> +#define MAX 100 >> +int A[MAX]; >> + >> +extern void abort (); >> + >> +void fuse() { >> +} >> + >> +int >> +main (void) >> +{ >> + int i; >> + >> + for (i = 0; i < MAX; i++) >> + A[i] = i; >> + for(int i=0; i<MAX; i++) >> + A[i] ^= 4; >> + for(int i=0; i<MAX; i++) >> + A[i] ^= 8; >> + >> + for (i = 0; i < MAX; i++) >> + if (A[i] != (i ^ 12)) >> + abort (); >> + >> + return 0; >> +} >> diff --git a/gcc/testsuite/gcc.dg/graphite/fuse-2.c >> b/gcc/testsuite/gcc.dg/graphite/fuse-2.c >> new file mode 100644 >> index 0000000..e1a1cb3 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/graphite/fuse-2.c >> @@ -0,0 +1,38 @@ >> +/* Check that the three loops are fused. 
*/ >> +/* { dg-options "-O2 -floop-fuse" } */ >> +/* { dg-do run } */ >> + >> +/* FIXME: Add a graphite dump mechanism to print the number of loops >> generated >> + by ISL and pattern match it. */ >> + >> +#define MAX 100 >> +int A[MAX], B[MAX], C[MAX]; >> + >> +extern void abort (); >> + >> +void fuse() { >> +} >> + >> +int >> +main (void) >> +{ >> + int i; >> + >> + /* The next three loops should be fused. */ >> + for (i = 0; i < MAX; i++) >> + { >> + A[i] = i; >> + B[i] = i + 2; >> + C[i] = i + 1; >> + } >> + for(int i=0; i<MAX; i++) >> + A[i] += B[i]; >> + for(int i=0; i<MAX; i++) >> + A[i] += C[i]; >> + >> + for (i = 0; i < MAX; i++) >> + if (A[i] != 3*i+3) >> + abort (); >> + >> + return 0; >> +} >> >