[Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM

xuelingko at yahoo dot com.tw Fri, 17 Jan 2014 03:27:51 -0800

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59857


            Bug ID: 59857
           Summary: 4.8.2 loop optimization is worse than 4.5.1 under ARM
           Product: gcc
           Version: 4.8.2
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: xuelingko at yahoo dot com.tw

I compiled a simple source file, memread.c, with gcc 4.8.2 and gcc 4.5.1.

The C code is:
/*
 * TEST_Memread - reproducer for the code-generation comparison in this report.
 *
 * Reads nCount consecutive elements starting at pSrc, one per loop iteration,
 * advancing the pointer after each read. Each value is stored into `val`,
 * which is never read afterwards. Always returns 10.
 *
 * NOTE(review): `ulv` is not defined in this report; presumably a typedef for
 * `unsigned long volatile` (both disassemblies keep the load) -- confirm.
 * NOTE(review): NULL needs a header such as <stddef.h>, which the snippet
 * does not show.
 */
int TEST_Memread(ulv * pSrc, unsigned int nCount)
{
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
        val = *p1++;    /* load-then-advance: the statement whose codegen differs between the two compilers */

    return 10;
}

# gcc -Wall -O2 -static -g -gstabs+ -c memread.c

4.8.2:
Target: armv7a
Configured with: ../gcc-4.8.2/configure
--prefix=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/gcc
--host=x86_64-pc-linux-gnu --build=x86_64-pc-linux-gnu
--target=armv7a-mediatek-linux-gnueabi
--with-sysroot=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/gcc/sysroot
--with-arch=armv7-a --with-tune=cortex-a7 --with-cpu=cortex-a7 --with-interwork
--with-fpu=vfpv4-d16 --with-float=softfp --with-gnu-as --with-gnu-ld
--disable-nls --enable-shared --enable-__cxa_atexit --disable-multilib
--enable-c99 --enable-long-long --enable-threads=posix --enable-languages=c,c++
--with-gmp=/tmp/root/build/x86_64 --with-mpfr=/tmp/root/build/x86_64
--with-cloog=/tmp/root/build/x86_64 --with-isl=/tmp/root/build/x86_64
--with-libelf=/tmp/root/build/x86_64
--program-transform-name='s,^,armv7a_001_vfp-linux-gnueabi-,'
--with-mpc=/tmp/root/build/x86_64 --enable-lto --without-system-libunwind
--disable-rpath --with-host-libstdcxx='-static-libgcc
-Wl,-Bstatic,-lstdc++,-Bdynamic,-lm'
--with-specs='%{!fno-unwind-tables:-funwind-tables}'
--with-build-time-tools=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/binutils/armv7a/bin
--enable-cxx-flags='-g -O2'
Thread model: posix
gcc version 4.8.2 20131014 (prerelease) (Linaro GCC 4.8-2013.10)

4.5.1:
Target: armv7a
Configured with: ../gcc-4.5.1/configure
--prefix=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/gcc
--host=i686-pc-linux-gnu --target=armv7a
--with-sysroot=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/gcc/sysroot
--with-arch=armv7-a --with-tune=cortex-a9 --with-cpu=cortex-a9 --with-interwork
--with-fpu=vfp --with-float=softfp --with-gnu-as --with-gnu-ld --disable-nls
--enable-shared --enable-__cxa_atexit --disable-multilib --enable-c99
--enable-long-long --enable-threads=posix --enable-languages=c,c++
--with-gmp=/tmp/root/build/i686 --with-mpfr=/tmp/root/build/i686
--with-ppl=/tmp/root/build/i686 --with-cloog=/tmp/root/build/i686
--with-libelf=/tmp/root/build/i686 --program-transform-name='s,^,armv7a-,'
--with-mpc=/tmp/root/build/i686 --enable-lto --without-system-libunwind
--disable-rpath --with-host-libstdcxx='-static-libgcc
-Wl,-Bstatic,-lstdc++,-Bdynamic,-lm'
--with-specs='%{!fno-unwind-tables:-funwind-tables}'
--with-build-time-tools=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/binutils/armv7a/bin/
--enable-cxx-flags='-g -O2'
Thread model: posix
gcc version 4.5.1 (GCC)




The objdump of 4.8.2 is

Disassembly of section .text:

00000000 <TEST_Memread>:
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
   0:    e3510000     cmp    r1, #0
   4:    0a000005     beq    20 <TEST_Memread+0x20>
   8:    e3a03000     mov    r3, #0
        val = *p1++;
   c:    e5902000     ldr    r2, [r0]
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
  10:    e2833001     add    r3, r3, #1
  14:    e1530001     cmp    r3, r1
        val = *p1++;
  18:    e2800004     add    r0, r0, #4
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
  1c:    1afffffa     bne    c <TEST_Memread+0xc>
        val = *p1++;

    return 10;
}
  20:    e3a0000a     mov    r0, #10
  24:    e12fff1e     bx    lr


The objdump of 4.5.1 is

Disassembly of section .text:

00000000 <TEST_Memread>:
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
   0:    e3510000     cmp    r1, #0
   4:    0a000004     beq    1c <TEST_Memread+0x1c>
   8:    e3a03000     mov    r3, #0
   c:    e2833001     add    r3, r3, #1
        val = *p1++;
  10:    e4902004     ldr    r2, [r0], #4
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
  14:    e1510003     cmp    r1, r3
  18:    8afffffb     bhi    c <TEST_Memread+0xc>
        val = *p1++;

    return 10;
}
  1c:    e3a0000a     mov    r0, #10
  20:    e12fff1e     bx    lr



The main difference between them is:
4.8.2:
   c:    e5902000     ldr    r2, [r0]
  18:    e2800004     add    r0, r0, #4
4.5.1:
  10:    e4902004     ldr    r2, [r0], #4

For this loop, the code generated by 4.8.2 achieves only 80% of the performance
of the code generated by 4.5.1, so memory-read throughput is worse with 4.8.2.

[Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM

Reply via email to