http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56127
Bug #: 56127 Summary: Incorrect code with -O2 Classification: Unclassified Product: gcc Version: 4.6.3 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c++ AssignedTo: unassig...@gcc.gnu.org ReportedBy: trosenb...@gmail.com Created attachment 29290 --> http://gcc.gnu.org/bugzilla/attachment.cgi?id=29290 main.ii It seems like GCC 4.6.3 generates incorrect assembly for ARM Cortex-A9 from the following C++ code when invoked with -O1, -O2 and -O3. The code just increments two variables 100 times and then divides one by the other (PMTsum / nValues). The correct output is: average = 1.000 PMTsum = 100 nValues = 100 but -O1 gives this: average = 0.000 PMTsum = 100 nValues = 100 and -O2 and -O3 yield this: average = inf PMTsum = 100 nValues = 100 This 3-file C++ project is the minimal configuration that shows this behavior. With everything in one file the problem vanishes. Below the C++ code is the assembly language output, marked with *** where I think the error may be (but I'm unfamiliar with assembly language). Thanks to anyone who looks into this, and to everyone who has been developing GCC! 
//exp_results.h class exp_results { public: exp_results() : PMTsum(0), nValues(0) { } unsigned increment(); double get_average() { if (nValues) return ((double)PMTsum) / ((double)nValues); else return 0; } unsigned PMTsum; unsigned nValues; }; //exp_results.cpp #include "exp_results.h" unsigned exp_results::increment() { PMTsum++; nValues++; return 1; } //main.cc #include <stdio.h> #include "exp_results.h" int main() { exp_results r; for(unsigned i=0; i< 100; i++) r.increment(); printf("average = %9.3f\n", r.get_average()); printf("PMTsum = %u nValues = %u\n", r.PMTsum, r.nValues); return 0; } // main.s .cpu cortex-a9 .eabi_attribute 27, 3 .fpu neon-fp16 .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 .eabi_attribute 24, 1 .eabi_attribute 25, 1 .eabi_attribute 26, 1 .eabi_attribute 30, 2 .eabi_attribute 34, 1 .eabi_attribute 18, 4 .file "main.cc" @ GNU C++ (Sourcery CodeBench Lite 2012.03-83) version 4.6.3 (arm-xilinx-eabi) @ compiled by GNU C version 4.3.2, GMP version 4.3.2, MPFR version 3.0.1-p4, MPC version 0.9 @ GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 @ options passed: -fpreprocessed main.ii -mcpu=cortex-a9 @ -mfloat-abi=softfp -mfpu=neon-fp16 -auxbase-strip src/main.o -O2 -Wall @ -fmessage-length=0 -fverbose-asm -fremove-local-statics @ options enabled: -fauto-inc-dec -fbranch-count-reg -fcaller-saves @ -fcombine-stack-adjustments -fcommon -fcompare-elim -fcprop-registers @ -fcrossjumping -fcse-follow-jumps -fdefer-pop @ -fdelete-null-pointer-checks -fdevirtualize -fdwarf2-cfi-asm @ -fearly-inlining -feliminate-unused-debug-types -fexceptions @ -fexpensive-optimizations -fextension-elimination -fforward-propagate @ -ffunction-cse -fgcse -fgcse-lm -fguess-branch-probability -fident @ -fif-conversion -fif-conversion2 -findirect-inlining -finline @ -finline-functions-called-once -finline-small-functions -fipa-cp @ -fipa-profile -fipa-pure-const -fipa-reference -fipa-sra @ -fira-share-save-slots 
-fira-share-spill-slots -fivopts @ -fkeep-static-consts -fleading-underscore -fmath-errno -fmerge-constants @ -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer @ -foptimize-register-move -foptimize-sibling-calls -fpartial-inlining @ -fpeephole -fpeephole2 -fprefetch-loop-arrays -fpromote-loop-indices @ -freg-struct-return -fregmove -frename-registers -freorder-blocks @ -freorder-functions -frerun-cse-after-loop @ -fsched-critical-path-heuristic -fsched-dep-count-heuristic @ -fsched-group-heuristic -fsched-interblock -fsched-last-insn-heuristic @ -fsched-rank-heuristic -fsched-spec -fsched-spec-insn-heuristic @ -fsched-stalled-insns-dep -fschedule-insns -fschedule-insns2 @ -fsection-anchors -fshow-column -fsigned-zeros -fsplit-ivs-in-unroller @ -fsplit-wide-types -fstrict-aliasing -fstrict-overflow @ -fstrict-volatile-bitfields -fthread-jumps -ftoplevel-reorder @ -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce -ftree-ccp @ -ftree-ch -ftree-copy-prop -ftree-copyrename -ftree-cselim -ftree-dce @ -ftree-dominator-opts -ftree-dse -ftree-forwprop -ftree-fre @ -ftree-if-to-switch-conversion -ftree-loop-if-convert -ftree-loop-im @ -ftree-loop-ivcanon -ftree-loop-optimize -ftree-parallelize-loops= @ -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc -ftree-scev-cprop @ -ftree-sink -ftree-slp-vectorize -ftree-sra -ftree-switch-conversion @ -ftree-ter -ftree-vect-loop-version -ftree-vrp -funit-at-a-time @ -funroll-loops -fverbose-asm -fweb -fzero-initialized-in-bss @ -mlittle-endian -msched-prolog -munaligned-access @ Compiler executable checksum: af2616fad9f2abb21c14f2e52d2eaee7 .section .text.startup,"ax",%progbits .align 2 .global main .type main, %function main: .fnstart .LFB4: @ args = 0, pretend = 0, frame = 8 @ frame_needed = 0, uses_anonymous_args = 0 stmfd sp!, {r4, lr} @, .save {r4, lr} mov r4, #100 @ ivtmp.3, .pad #8 sub sp, sp, #8 @,, mov r3, #0 @ tmp144, str r3, [sp, #0] @ tmp144, r.PMTsum str r3, [sp, #4] @ tmp144, r.nValues .L3: mov r0, 
sp @, sub r4, r4, #1 @ tmp156, ivtmp.3, bl _ZN11exp_results9incrementEv @ mov r0, sp @, bl _ZN11exp_results9incrementEv @ subs r4, r4, #1 @ ivtmp.3, tmp156, bne .L3 @, ldr r3, [sp, #4] @ D.6224, r.nValues movw r0, #:lower16:.LC0 @, movt r0, #:upper16:.LC0 @, cmp r3, #0 @ D.6224, fmsrne s15, r3 @ int @, D.6224 fuitodne d16, s15 @ tmp149, fldsne s15, [sp, #0] @ int @, r.PMTsum vmov.i32 d16, #0 @ D.6219 //*** load 0 into d16 (why? is this the bug?) *** fuitodne d17, s15 @ tmp147, fdivdne d16, d17, d16 @ D.6219, tmp147, tmp149 fmrrd r2, r3, d16 @, D.6219 bl printf @ movw r0, #:lower16:.LC1 @, ldmia sp, {r1, r2} @,, movt r0, #:upper16:.LC1 @, bl printf @ mov r0, #0 @, add sp, sp, #8 @,, ldmfd sp!, {r4, pc} .fnend .size main, .-main .section .rodata.str1.4,"aMS",%progbits,1 .align 2 .LC0: .ascii "average = %9.3f\012\000" .space 3 .LC1: .ascii "PMTsum = %u nValues = %u\012\000" .ident "GCC: (Sourcery CodeBench Lite 2012.03-83) 4.6.3" // exp_results.s .cpu cortex-a9 .eabi_attribute 27, 3 .fpu neon-fp16 .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 .eabi_attribute 24, 1 .eabi_attribute 25, 1 .eabi_attribute 26, 1 .eabi_attribute 30, 2 .eabi_attribute 34, 1 .eabi_attribute 18, 4 .file "exp_results.cpp" @ GNU C++ (Sourcery CodeBench Lite 2012.03-83) version 4.6.3 (arm-xilinx-eabi) @ compiled by GNU C version 4.3.2, GMP version 4.3.2, MPFR version 3.0.1-p4, MPC version 0.9 @ GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 @ options passed: -fpreprocessed exp_results.ii -mcpu=cortex-a9 @ -mfloat-abi=softfp -mfpu=neon-fp16 -auxbase-strip src/exp_results.o -O2 @ -Wall -fmessage-length=0 -fverbose-asm -fremove-local-statics @ options enabled: -fauto-inc-dec -fbranch-count-reg -fcaller-saves @ -fcombine-stack-adjustments -fcommon -fcompare-elim -fcprop-registers @ -fcrossjumping -fcse-follow-jumps -fdefer-pop @ -fdelete-null-pointer-checks -fdevirtualize -fdwarf2-cfi-asm @ -fearly-inlining -feliminate-unused-debug-types -fexceptions @ 
-fexpensive-optimizations -fextension-elimination -fforward-propagate @ -ffunction-cse -fgcse -fgcse-lm -fguess-branch-probability -fident @ -fif-conversion -fif-conversion2 -findirect-inlining -finline @ -finline-functions-called-once -finline-small-functions -fipa-cp @ -fipa-profile -fipa-pure-const -fipa-reference -fipa-sra @ -fira-share-save-slots -fira-share-spill-slots -fivopts @ -fkeep-static-consts -fleading-underscore -fmath-errno -fmerge-constants @ -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer @ -foptimize-register-move -foptimize-sibling-calls -fpartial-inlining @ -fpeephole -fpeephole2 -fprefetch-loop-arrays -fpromote-loop-indices @ -freg-struct-return -fregmove -frename-registers -freorder-blocks @ -freorder-functions -frerun-cse-after-loop @ -fsched-critical-path-heuristic -fsched-dep-count-heuristic @ -fsched-group-heuristic -fsched-interblock -fsched-last-insn-heuristic @ -fsched-rank-heuristic -fsched-spec -fsched-spec-insn-heuristic @ -fsched-stalled-insns-dep -fschedule-insns -fschedule-insns2 @ -fsection-anchors -fshow-column -fsigned-zeros -fsplit-ivs-in-unroller @ -fsplit-wide-types -fstrict-aliasing -fstrict-overflow @ -fstrict-volatile-bitfields -fthread-jumps -ftoplevel-reorder @ -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce -ftree-ccp @ -ftree-ch -ftree-copy-prop -ftree-copyrename -ftree-cselim -ftree-dce @ -ftree-dominator-opts -ftree-dse -ftree-forwprop -ftree-fre @ -ftree-if-to-switch-conversion -ftree-loop-if-convert -ftree-loop-im @ -ftree-loop-ivcanon -ftree-loop-optimize -ftree-parallelize-loops= @ -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc -ftree-scev-cprop @ -ftree-sink -ftree-slp-vectorize -ftree-sra -ftree-switch-conversion @ -ftree-ter -ftree-vect-loop-version -ftree-vrp -funit-at-a-time @ -funroll-loops -fverbose-asm -fweb -fzero-initialized-in-bss @ -mlittle-endian -msched-prolog -munaligned-access @ Compiler executable checksum: af2616fad9f2abb21c14f2e52d2eaee7 .text .align 2 .global 
_ZN11exp_results9incrementEv .type _ZN11exp_results9incrementEv, %function _ZN11exp_results9incrementEv: .fnstart .LFB4: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 @ link register save eliminated. mov r3, r0 @ this, this mov r0, #1 @, ldmia r3, {r1, r2} @ this,, add r1, r1, r0 @ tmp141, this_1(D)->PMTsum, add r2, r2, r0 @ tmp143, this_1(D)->nValues, stmia r3, {r1, r2} @ this,, bx lr @ .cantunwind .fnend .size _ZN11exp_results9incrementEv, .-_ZN11exp_results9incrementEv .ident "GCC: (Sourcery CodeBench Lite 2012.03-83) 4.6.3"