On Fri, 25 Oct 2019, Jiufu Guo wrote: > Hi, > > In PR88760, there are a few disscussion about improve or tune unroller for > targets. And we would agree to enable unroller for small loops at O2 first. > And we could see performance improvement(~10%) for below code: > ``` > subroutine foo (i, i1, block) > integer :: i, i1 > integer :: block(9, 9, 9) > block(i:9,1,i1) = block(i:9,1,i1) - 10 > end subroutine foo > > ``` > This kind of code occurs a few times in exchange2 benchmark. > > Similar C code: > ``` > for (i = 0; i < n; i++) > arr[i] = arr[i] - 10; > ``` > > On powerpc64le, for O2 , enable -funroll-loops and limit > PARAM_MAX_UNROLL_TIMES=2 and PARAM_MAX_UNROLLED_INSNS=20, we can see >2% > overall improvement for SPEC2017.
Note the behavior with LTO will be surprising (and generally when functions are compiled with different -O level via the optimize attribute). It's a bad idea to base --param values on the value of a global optimization option that can be set per function (see PR92046). A better patch would have been to adjust via the target hooks for unroll adjust. Thanks, Richard. > This patch is only for rs6000 in which we see visible performance improvement. > > Bootstrapped on powerpc64le, and cases are updated. Is this ok for trunk? > > Jiufu Guo > BR > > > gcc/ > 2019-10-25 Jiufu Guo <guoji...@linux.ibm.com> > > PR tree-optimization/88760 > * config/rs6000/rs6000-common.c (rs6000_option_optimization_table): for > O2, enable funroll-loops. > * config/rs6000/rs6000.c (rs6000_option_override_internal): if unroller > is enabled throught O2, set constrains to PARAM_MAX_UNROLL_TIMES=2 and > PARAM_MAX_UNROLLED_INSNS=20 for small loops. > > gcc.testsuite/ > 2019-10-25 Jiufu Guo <guoji...@linux.ibm.com> > > PR tree-optimization/88760 > * gcc.target/powerpc/small-loop-unroll.c: New test. > * c-c++-common/tsan/thread_leak2.c: Update test. > * gcc.dg/pr59643.c: Update test. > * gcc.target/powerpc/loop_align.c: Update test. > * gcc.target/powerpc/ppc-fma-1.c: Update test. > * gcc.target/powerpc/ppc-fma-2.c: Update test. > * gcc.target/powerpc/ppc-fma-3.c: Update test. > * gcc.target/powerpc/ppc-fma-4.c: Update test. > * gcc.target/powerpc/pr78604.c: Update test. > > --- > gcc/common/config/rs6000/rs6000-common.c | 1 + > gcc/config/rs6000/rs6000.c | 20 > ++++++++++++++++++++ > gcc/testsuite/c-c++-common/tsan/thread_leak2.c | 1 + > gcc/testsuite/gcc.dg/pr59643.c | 1 + > gcc/testsuite/gcc.target/powerpc/loop_align.c | 2 +- > gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c | 2 +- > gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c | 2 +- > gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c | 2 +- > gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c | 2 +- > gcc/testsuite/gcc.target/powerpc/pr78604.c | 2 +- > gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c | 13 +++++++++++++ > 11 files changed, 42 insertions(+), 6 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c > > diff --git a/gcc/common/config/rs6000/rs6000-common.c > b/gcc/common/config/rs6000/rs6000-common.c > index 4b0c205..b947196 100644 > --- a/gcc/common/config/rs6000/rs6000-common.c > +++ b/gcc/common/config/rs6000/rs6000-common.c > @@ -35,6 +35,7 @@ static const struct default_options > rs6000_option_optimization_table[] = > { OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 }, > /* Enable -fsched-pressure for first pass instruction scheduling. */ > { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 }, > + { OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 }, > { OPT_LEVELS_NONE, 0, NULL, 0 } > }; > > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c > index a129137d..9a8ff9a 100644 > --- a/gcc/config/rs6000/rs6000.c > +++ b/gcc/config/rs6000/rs6000.c > @@ -4540,6 +4540,26 @@ rs6000_option_override_internal (bool global_init_p) > global_options.x_param_values, > global_options_set.x_param_values); > > + /* unroll very small loops 2 time if no -funroll-loops. */ > + if (!global_options_set.x_flag_unroll_loops > + && !global_options_set.x_flag_unroll_all_loops) > + { > + maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2, > + global_options.x_param_values, > + global_options_set.x_param_values); > + > + maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20, > + global_options.x_param_values, > + global_options_set.x_param_values); > + > + /* If fweb or frename-registers are not specificed in command-line, > + do not turn them on implicitly. */ > + if (!global_options_set.x_flag_web) > + global_options.x_flag_web = 0; > + if (!global_options_set.x_flag_rename_registers) > + global_options.x_flag_rename_registers = 0; > + } > + > /* If using typedef char *va_list, signal that > __builtin_va_start (&ap, 0) can be optimized to > ap = __builtin_next_arg (0). */ > diff --git a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c > b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c > index c9b8046..17aa5c6 100644 > --- a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c > +++ b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c > @@ -1,4 +1,5 @@ > /* { dg-shouldfail "tsan" } */ > +/* { dg-additional-options "-fno-unroll-loops" { target { powerpc-*-* > powerpc64-*-* powerpc64le-*-* } } } */ > > #include <pthread.h> > #include <unistd.h> > diff --git a/gcc/testsuite/gcc.dg/pr59643.c b/gcc/testsuite/gcc.dg/pr59643.c > index de78d60..3aef439 100644 > --- a/gcc/testsuite/gcc.dg/pr59643.c > +++ b/gcc/testsuite/gcc.dg/pr59643.c > @@ -1,6 +1,7 @@ > /* PR tree-optimization/59643 */ > /* { dg-do compile } */ > /* { dg-options "-O3 -fdump-tree-pcom-details" } */ > +/* { dg-additional-options "-fno-unroll-loops" { target { powerpc-*-* > powerpc64-*-* powerpc64le-*-* } } } */ > > void > foo (double *a, double *b, double *c, double d, double e, int n) > diff --git a/gcc/testsuite/gcc.target/powerpc/loop_align.c > b/gcc/testsuite/gcc.target/powerpc/loop_align.c > index ebe3782..ef67f77 100644 > --- a/gcc/testsuite/gcc.target/powerpc/loop_align.c > +++ b/gcc/testsuite/gcc.target/powerpc/loop_align.c > @@ -1,6 +1,6 @@ > /* { dg-do compile { target { powerpc*-*-* } } } */ > /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */ > -/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */ > +/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 > -fno-unroll-loops" } */ > /* { dg-final { scan-assembler ".p2align 5" } } */ > > void f(double *a, double *b, double *c, unsigned long n) { > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c > b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c > index b4945e6..2a5b92c 100644 > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c > @@ -1,7 +1,7 @@ > /* { dg-do compile { target { powerpc*-*-* } } } */ > /* { dg-skip-if "" { powerpc*-*-darwin* } } */ > /* { dg-require-effective-target powerpc_vsx_ok } */ > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */ > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math > -fno-unroll-loops" } */ > /* { dg-final { scan-assembler-times "xvmadd" 4 } } */ > /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */ > /* { dg-final { scan-assembler-times "fmadds" 2 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c > b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c > index 5ed630a..bf2c67f 100644 > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c > @@ -1,7 +1,7 @@ > /* { dg-do compile { target { powerpc*-*-* } } } */ > /* { dg-skip-if "" { powerpc*-*-darwin* } } */ > /* { dg-require-effective-target powerpc_vsx_ok } */ > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math > -ffp-contract=off" } */ > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math > -ffp-contract=off -fno-unroll-loops" } */ > /* { dg-final { scan-assembler-times "xvmadd" 2 } } */ > /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */ > /* { dg-final { scan-assembler-times "fmadds" 1 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c > b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c > index ef252b3..8608116 100644 > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c > @@ -2,7 +2,7 @@ > /* { dg-skip-if "" { powerpc*-*-darwin* } } */ > /* { dg-require-effective-target powerpc_altivec_ok } */ > /* { dg-require-effective-target powerpc_fprs } */ > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec > -ffast-math" } */ > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec > -ffast-math -fno-unroll-loops" } */ > /* { dg-final { scan-assembler-times "vmaddfp" 2 } } */ > /* { dg-final { scan-assembler-times "fmadd " 2 } } */ > /* { dg-final { scan-assembler-times "fmadds" 2 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c > b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c > index c2eaf1a..291c2ee 100644 > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c > @@ -2,7 +2,7 @@ > /* { dg-skip-if "" { powerpc*-*-darwin* } } */ > /* { dg-require-effective-target powerpc_altivec_ok } */ > /* { dg-require-effective-target powerpc_fprs } */ > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec > -ffast-math -ffp-contract=off" } */ > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec > -ffast-math -ffp-contract=off -fno-unroll-loops" } */ > /* { dg-final { scan-assembler-times "vmaddfp" 1 } } */ > /* { dg-final { scan-assembler-times "fmadd " 1 } } */ > /* { dg-final { scan-assembler-times "fmadds" 1 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr78604.c > b/gcc/testsuite/gcc.target/powerpc/pr78604.c > index 76d8945..35bfdb3 100644 > --- a/gcc/testsuite/gcc.target/powerpc/pr78604.c > +++ b/gcc/testsuite/gcc.target/powerpc/pr78604.c > @@ -1,7 +1,7 @@ > /* { dg-do compile { target { powerpc*-*-* } } } */ > /* { dg-skip-if "" { powerpc*-*-darwin* } } */ > /* { dg-require-effective-target powerpc_p8vector_ok } */ > -/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize > -fdump-tree-vect-details" } */ > +/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize > -fdump-tree-vect-details -fno-unroll-loops" } */ > > #ifndef SIZE > #define SIZE 1024 > diff --git a/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c > b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c > new file mode 100644 > index 0000000..888ea50b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */ > + > +void __attribute__ ((noinline)) foo(int n, int *arr) > +{ > + int i; > + for (i = 0; i < n; i++) > + arr[i] = arr[i] - 10; > +} > +/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" > } } */ > +/* { dg-final { scan-assembler-times "lwz" 3 } } */ > +/* { dg-final { scan-assembler-times "stw" 3 } } */ > + > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)