[Bug tree-optimization/57204] New: Auto-vectorization in nested loops with non-varying indexed array access results in very poor performance (worse than no auto-vectorization)

snagavallis at outlook dot com Wed, 08 May 2013 02:09:50 -0700


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57204




             Bug #: 57204

           Summary: Auto-vectorization in nested loops with non-varying

                    indexed array access results in very poor performance

                    (worse than no auto-vectorization)

    Classification: Unclassified

           Product: gcc

           Version: 4.7.3

            Status: UNCONFIRMED

          Severity: normal

          Priority: P3

         Component: tree-optimization

        AssignedTo: unassig...@gcc.gnu.org

        ReportedBy: snagaval...@outlook.com





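In the good case below, auto-vectorization improves performance by a factor of

3. In the bad case, it actually decreases performance compared to no

auto-vectorization. The difference between the two versions is that the good

case reads d[i*n+k] into a local (d_ik) before the innermost loop, while the

bad case reads d[i*n+k] inside the innermost loop.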



Good:

/*
 * "Good" reproducer: in-place update of the flat float array d, indexed
 * as d[row*n + col] with all three loop counters running over [0, n).
 *
 * For each (k, i) pair, d[i*n+k] is read once into d_ik before the j loop.
 * Each d[i*n+j] is then kept if it is less than d_ik + d[k*n+j], and
 * otherwise overwritten with that sum.
 *
 * d: array accessed at indices up to (n-1)*n + (n-1); modified in place.
 * n: loop bound and row stride.
 */
void foo(float * d, int n)

{

  int i, j, k;

  for (k=0; k<n; ++k) {

    for (i=0; i<n; ++i) {

      /* Snapshot taken before the j loop: the store to d[i*n+j] below
         (which hits d[i*n+k] when j == k) does not change d_ik. */
      float d_ik = d[i*n+k]; 

      for (j=0; j<n; ++j) {

        float t = d_ik + d[k*n+j];

        /* Keep the existing element if it is already less than t. */
        d[i*n+j] = (d[i*n+j] < t) ? d[i*n+j] : t;

      }

    }

  }

}



Bad:

/*
 * "Bad" reproducer: in-place update of the flat float array d, indexed
 * as d[row*n + col] with all three loop counters running over [0, n).
 *
 * Each d[i*n+j] is kept if it is less than d[i*n+k] + d[k*n+j], and
 * otherwise overwritten with that sum. d[i*n+k] is re-read from the array
 * on every iteration of the innermost j loop.
 *
 * d: array accessed at indices up to (n-1)*n + (n-1); modified in place.
 * n: loop bound and row stride.
 */
void foo(float * d, int n)

{

  int i, j, k;

  for (k=0; k<n; ++k) {

    for (i=0; i<n; ++i) {

      for (j=0; j<n; ++j) {

        /* d[i*n+k] does not depend on j, but it is loaded here each
           iteration. When j == k the store below writes this same
           element, so later iterations read the updated value. */
        float t = d[i*n+k] + d[k*n+j];

        /* Keep the existing element if it is already less than t. */
        d[i*n+j] = (d[i*n+j] < t) ? d[i*n+j] : t;

      }

    }

  }

}



$ gcc -v

Using built-in specs.

COLLECT_GCC=gcc

COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.7/lto-wrapper

Target: x86_64-linux-gnu

Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro

4.7.3-1ubuntu1' --with-bugurl=file:///usr/share/doc/gcc-4.7/README.Bugs

--enable-languages=c,c++,go,fortran,objc,obj-c++ --prefix=/usr

--program-suffix=-4.7 --enable-shared --enable-linker-build-id

--libexecdir=/usr/lib --without-included-gettext --enable-threads=posix

--with-gxx-include-dir=/usr/include/c++/4.7 --libdir=/usr/lib --enable-nls

--with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug

--enable-libstdcxx-time=yes --enable-gnu-unique-object --enable-plugin

--with-system-zlib --enable-objc-gc --with-cloog --enable-cloog-backend=ppl

--disable-cloog-version-check --disable-ppl-version-check --enable-multiarch

--disable-werror --with-arch-32=i686 --with-abi=m64

--with-multilib-list=m32,m64,mx32 --with-tune=generic --enable-checking=release

--build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu

Thread model: posix

gcc version 4.7.3 (Ubuntu/Linaro 4.7.3-1ubuntu1)



$ gcc -march=native -Q --help=target

The following options are target specific:

  -m128bit-long-double                [disabled]

  -m32                                [disabled]

  -m3dnow                             [disabled]

  -m3dnowa                            [disabled]

  -m64                                [enabled]

  -m80387                             [enabled]

  -m8bit-idiv                         [disabled]

  -m96bit-long-double                 [enabled]

  -mabi=                              sysv

  -mabm                               [disabled]

  -maccumulate-outgoing-args          [disabled]

  -maddress-mode=                     short

  -maes                               [disabled]

  -malign-double                      [disabled]

  -malign-functions=                  0

  -malign-jumps=                      0

  -malign-loops=                      0

  -malign-stringops                   [enabled]

  -mandroid                           [disabled]

  -march=                             corei7

  -masm=                              att

  -mavx                               [disabled]

  -mavx2                              [disabled]

  -mavx256-split-unaligned-load     [disabled]

  -mavx256-split-unaligned-store     [disabled]

  -mbionic                            [disabled]

  -mbmi                               [disabled]

  -mbmi2                              [disabled]

  -mbranch-cost=                      0

  -mcld                               [disabled]

  -mcmodel=                           32

  -mcpu=                              

  -mcrc32                             [disabled]

  -mcx16                              [enabled]

  -mdispatch-scheduler                [disabled]

  -mf16c                              [disabled]

  -mfancy-math-387                    [enabled]

  -mfentry                            [enabled]

  -mfma                               [disabled]

  -mfma4                              [disabled]

  -mforce-drap                        [disabled]

  -mfp-ret-in-387                     [enabled]

  -mfpmath=                           387

  -mfsgsbase                          [disabled]

  -mfused-madd                        

  -mglibc                             [enabled]

  -mhard-float                        [enabled]

  -mieee-fp                           [enabled]

  -mincoming-stack-boundary=          0

  -minline-all-stringops              [disabled]

  -minline-stringops-dynamically     [disabled]

  -mintel-syntax                      

  -mlarge-data-threshold=             0x10000

  -mlwp                               [disabled]

  -mlzcnt                             [disabled]

  -mmmx                               [disabled]

  -mmovbe                             [disabled]

  -mms-bitfields                      [disabled]

  -mno-align-stringops                [disabled]

  -mno-fancy-math-387                 [disabled]

  -mno-push-args                      [disabled]

  -mno-red-zone                       [disabled]

  -mno-sse4                           [disabled]

  -momit-leaf-frame-pointer           [disabled]

  -mpc32                              [disabled]

  -mpc64                              [disabled]

  -mpc80                              [disabled]

  -mpclmul                            [disabled]

  -mpopcnt                            [enabled]

  -mprefer-avx128                     [disabled]

  -mpreferred-stack-boundary=         0

  -mpush-args                         [enabled]

  -mrdrnd                             [disabled]

  -mrecip                             [disabled]

  -mrecip=                            

  -mred-zone                          [enabled]

  -mregparm=                          0

  -mrtd                               [disabled]

  -msahf                              [enabled]

  -msoft-float                        [disabled]

  -msse                               [enabled]

  -msse2                              [enabled]

  -msse2avx                           [disabled]

  -msse3                              [enabled]

  -msse4                              [enabled]

  -msse4.1                            [enabled]

  -msse4.2                            [enabled]

  -msse4a                             [disabled]

  -msse5                              

  -msseregparm                        [disabled]

  -mssse3                             [enabled]

  -mstack-arg-probe                   [disabled]

  -mstackrealign                      [enabled]

  -mstringop-strategy=                [default]

  -mtbm                               [disabled]

  -mtls-dialect=                      gnu

  -mtls-direct-seg-refs               [enabled]

  -mtune=                             corei7

  -muclibc                            [disabled]

  -mveclibabi=                        [default]

  -mvect8-ret-in-mem                  [disabled]

  -mvzeroupper                        [disabled]

  -mx32                               [disabled]

  -mxop                               [disabled]



  Known assembler dialects (for use with the -masm-dialect= option):

    att intel



  Known ABIs (for use with the -mabi= option):

    ms sysv



  Known code models (for use with the -mcmodel= option):

    32 kernel large medium small



  Valid arguments to -mfpmath=:

    387 387+sse 387,sse both sse sse+387 sse,387



  Known vectorization library ABIs (for use with the -mveclibabi= option):

    acml svml



  Known address mode (for use with the -maddress-mode= option):

    long short



  Valid arguments to -mstringop-strategy=:

    byte_loop libcall loop rep_4byte rep_8byte rep_byte unrolled_loop



  Known TLS dialects (for use with the -mtls-dialect= option):
    gnu gnu2

[Bug tree-optimization/57204] New: Auto-vectorization in nested loops with non-varying indexed array access results in very poor performance (worse than no auto-vectorization)

Reply via email to