http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59857
Bug ID: 59857 Summary: 4.8.2 loop optimization is worse than 4.5.1 under ARM Product: gcc Version: 4.8.2 Status: UNCONFIRMED Severity: enhancement Priority: P3 Component: rtl-optimization Assignee: unassigned at gcc dot gnu.org Reporter: xuelingko at yahoo dot com.tw I compiled a simple source file, memread.c, with gcc 4.8.2 and 4.5.1. The C code is: int TEST_Memread(ulv * pSrc, unsigned int nCount) { unsigned int val; ulv *p1 = NULL; unsigned int i; p1 = (ulv *) pSrc; for (i = 0; i < nCount; i++) val = *p1++; return 10; } # gcc -Wall -O2 -static -g -gstabs+ -c memread.c 4.8.2: Target: armv7a Configured with: ../gcc-4.8.2/configure --prefix=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/gcc --host=x86_64-pc-linux-gnu --build=x86_64-pc-linux-gnu --target=armv7a-mediatek-linux-gnueabi --with-sysroot=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/gcc/sysroot --with-arch=armv7-a --with-tune=cortex-a7 --with-cpu=cortex-a7 --with-interwork --with-fpu=vfpv4-d16 --with-float=softfp --with-gnu-as --with-gnu-ld --disable-nls --enable-shared --enable-__cxa_atexit --disable-multilib --enable-c99 --enable-long-long --enable-threads=posix --enable-languages=c,c++ --with-gmp=/tmp/root/build/x86_64 --with-mpfr=/tmp/root/build/x86_64 --with-cloog=/tmp/root/build/x86_64 --with-isl=/tmp/root/build/x86_64 --with-libelf=/tmp/root/build/x86_64 --program-transform-name='s,^,armv7a_001_vfp-linux-gnueabi-,' --with-mpc=/tmp/root/build/x86_64 --enable-lto --without-system-libunwind --disable-rpath --with-host-libstdcxx='-static-libgcc -Wl,-Bstatic,-lstdc++,-Bdynamic,-lm' --with-specs='%{!fno-unwind-tables:-funwind-tables}' --with-build-time-tools=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/binutils/armv7a/bin --enable-cxx-flags='-g -O2' Thread model: posix gcc version 4.8.2 20131014 (prerelease) (Linaro GCC 4.8-2013.10) 4.5.1: Target: armv7a Configured with: ../gcc-4.5.1/configure --prefix=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/gcc --host=i686-pc-linux-gnu --target=armv7a 
--with-sysroot=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/gcc/sysroot --with-arch=armv7-a --with-tune=cortex-a9 --with-cpu=cortex-a9 --with-interwork --with-fpu=vfp --with-float=softfp --with-gnu-as --with-gnu-ld --disable-nls --enable-shared --enable-__cxa_atexit --disable-multilib --enable-c99 --enable-long-long --enable-threads=posix --enable-languages=c,c++ --with-gmp=/tmp/root/build/i686 --with-mpfr=/tmp/root/build/i686 --with-ppl=/tmp/root/build/i686 --with-cloog=/tmp/root/build/i686 --with-libelf=/tmp/root/build/i686 --program-transform-name='s,^,armv7a-,' --with-mpc=/tmp/root/build/i686 --enable-lto --without-system-libunwind --disable-rpath --with-host-libstdcxx='-static-libgcc -Wl,-Bstatic,-lstdc++,-Bdynamic,-lm' --with-specs='%{!fno-unwind-tables:-funwind-tables}' --with-build-time-tools=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/binutils/armv7a/bin/ --enable-cxx-flags='-g -O2' Thread model: posix gcc version 4.5.1 (GCC) The objdump of 4.8.2 is Disassembly of section .text: 00000000 <TEST_Memread>: unsigned int val; ulv *p1 = NULL; unsigned int i; p1 = (ulv *) pSrc; for (i = 0; i < nCount; i++) 0: e3510000 cmp r1, #0 4: 0a000005 beq 20 <TEST_Memread+0x20> 8: e3a03000 mov r3, #0 val = *p1++; c: e5902000 ldr r2, [r0] unsigned int val; ulv *p1 = NULL; unsigned int i; p1 = (ulv *) pSrc; for (i = 0; i < nCount; i++) 10: e2833001 add r3, r3, #1 14: e1530001 cmp r3, r1 val = *p1++; 18: e2800004 add r0, r0, #4 unsigned int val; ulv *p1 = NULL; unsigned int i; p1 = (ulv *) pSrc; for (i = 0; i < nCount; i++) 1c: 1afffffa bne c <TEST_Memread+0xc> val = *p1++; return 10; } 20: e3a0000a mov r0, #10 24: e12fff1e bx lr The objdump of 4.5.1 is Disassembly of section .text: 00000000 <TEST_Memread>: unsigned int val; ulv *p1 = NULL; unsigned int i; p1 = (ulv *) pSrc; for (i = 0; i < nCount; i++) 0: e3510000 cmp r1, #0 4: 0a000004 beq 1c <TEST_Memread+0x1c> 8: e3a03000 mov r3, #0 c: e2833001 add r3, r3, #1 val = *p1++; 10: e4902004 ldr r2, [r0], #4 unsigned int val; ulv 
*p1 = NULL; unsigned int i; p1 = (ulv *) pSrc; for (i = 0; i < nCount; i++) 14: e1510003 cmp r1, r3 18: 8afffffb bhi c <TEST_Memread+0xc> val = *p1++; return 10; } 1c: e3a0000a mov r0, #10 20: e12fff1e bx lr The main difference between them is: 4.8.2: c: e5902000 ldr r2, [r0] 18: e2800004 add r0, r0, #4 4.5.1: 10: e4902004 ldr r2, [r0], #4 That is, 4.5.1 loads and advances the pointer with a single post-indexed ldr, whereas 4.8.2 emits a plain ldr followed by a separate add. In this loop, 4.8.2 achieves only 80% of the performance of 4.5.1, which makes the memory read results poor when using 4.8.2.