#include <stdlib.h> void dupa() { double* wagi; unsigned int i,synapsy=100; wagi = (double*)malloc(100*synapsy); for( i=0;i<synapsy;i++ ) { wagi[i] = 0; } } Simple test case. When compiled with 4.0 (gcc-4.0 (GCC) 4.0.0 20050212 (experimental)) using g++-4.0 -pedantic --save-temps -ftree-vectorize -O3 -Wall -mtune=pentium3 -c test.c, I essentially get: .LFB15: pushl %ebp .LCFI0: movl %esp, %ebp .LCFI1: subl $8, %esp .LCFI2: movl $10000, (%esp) call malloc movl $1, %edx .p2align 4,,15 .L2: xorl %ecx, %ecx movl %ecx, -8(%eax,%edx,8) xorl %ecx, %ecx movl %ecx, -4(%eax,%edx,8) incl %edx cmpl $101, %edx jne .L2 leave ret So the xor on ecx is executed twice inside the loop! It looks similar with 3.4: L5: movl $0, (%eax,%edx,8) xorl %ecx, %ecx movl %ecx, 4(%eax,%edx,8) incl %edx cmpl $100, %edx jb .L5 On UltraSPARC: .LLFB18: save %sp, -104, %sp .LLCFI0: sethi %hi(9216), %o0 call malloc, 0 or %o0, 784, %o0 mov 0, %g1 .LL2: add %g1, %o0, %g2 add %g1, 8, %g1 st %g0, [%g2] cmp %g1, 800 bne .LL2 st %g0, [%g2+4] jmp %i7+8 restore It's odd, because I specify -O3 precisely to make sure the code will be as fast as possible :) -O0 uses floating-point instructions to zero the array, which is then extremely slow. -O1 uses floating-point instructions too, but the code is 3x smaller and neater: fldz .L2: fstl -8(%eax,%edx,8) incl %edx cmpl $101, %edx jne .L2 fstp %st(0)
-- Summary: xor is enclosed in the loop and executed on each iteration of the for statement Product: gcc Version: 4.0.0 Status: UNCONFIRMED Severity: normal Priority: P2 Component: c++ AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: gj at pointblue dot com dot pl CC: gcc-bugs at gcc dot gnu dot org GCC build triplet: i686 GCC host triplet: i686 GCC target triplet: i686 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19922