For this code:
/* Test case from the report: store zero into the 64 unsigned ints p[0..63],
   one element per loop iteration.
   NOTE(review): declared to return int but has no return statement; kept
   as-is because the assembly listing for f in this report was generated
   from exactly this source. */
int f(unsigned int *p) {
for (int i = 0; i < 64; ++i)
p[i] = 0;
}
I'd expect to get something like the output for this code:
/* Hand-written version of f that the reporter expected the compiler to
   match: clear the same 64*4 bytes, but mostly through unsigned long
   stores, with at most one unsigned int store before and after.
   NOTE(review): like f, this is declared int but never returns a value. */
int f2(unsigned int *p) {
int c = 64*4; /* bytes still to clear */
/* If p is not a multiple of 8, clear one unsigned int first and count it. */
if ((unsigned long) p % 8) *p++ = 0, c -= 4;
/* NOTE(review): assigns unsigned int * to unsigned long * without a cast. */
unsigned long *l = p;
/* Clear through l, subtracting 8 from c per store, while c stays >= 8. */
do *l++ = 0; while ((c -= 8) >= 8);
p = l;
/* Anything left over (c != 0) is cleared with one more unsigned int store. */
if (c) *p++ = 0;
}
which is
# f2: compiler output (Alpha) for the hand-written C function f2 above.
# Register roles as used in this listing:
#   $16  pointer p (first integer argument register), advanced after each store
#   $4   byte count c: 256, or 252 after the leading 4-byte store
#   $3   bytes written so far by the 8-byte store loop
#   $2   bytes still remaining ($4 - $3)
#   $31  reads as zero; used as the stored value and as a zero base/source
f2:
and $16,7,$1        # $1 = p & 7, nonzero when p is not 8-byte aligned
lda $4,256($31)     # c = 256 (= 64*4)
beq $1,$L11         # already aligned: skip the leading 4-byte store
stl $31,0($16)      # store a 32-bit zero at p
lda $4,252($31)     # c = 252
lda $16,4($16)      # p += 4 bytes
$L11:
mov $31,$3          # bytes written by the loop = 0
.align 4
$L12:
lda $3,8($3)        # written += 8
stq $31,0($16)      # store a 64-bit zero at p
lda $16,8($16)      # p += 8 bytes
subl $4,$3,$2       # remaining = c - written
cmple $2,7,$1       # $1 = (remaining <= 7)
beq $1,$L12         # loop again while remaining >= 8
beq $2,$L17         # nothing left: skip the trailing store
stl $31,0($16)      # store one final 32-bit zero
$L17:
ret
but I get:
# f: compiler output (Alpha, -ftree-vectorize) for the C function f above.
# The listing has three store loops:
#   $L12  4-byte stores for the first $5 elements ($5 is 0 or 1, from p & 4)
#   $L10  8-byte stores, each covering two elements
#   $L14  4-byte stores for whatever elements remain
# Register roles as used in this listing:
#   $16  pointer p (first integer argument register), never modified
#   $5   number of elements handled by the first loop
#   $7   index of the next element not yet stored
#   $6   number of elements not yet stored
#   $22  elements left after the first loop (64 - $5)
#   $8   elements covered by the 8-byte loop (2 * iteration count)
#   $31  reads as zero
f:
and $16,4,$1        # $1 = p & 4
lda $5,64($31)      # $5 = 64
lda $6,64($31)      # remaining elements = 64
mov $31,$7          # next element index = 0
cmpult $31,$1,$1    # $1 = (0 < $1): 1 if bit 2 of p is set, else 0
cmplt $1,64,$2      # $2 = ($1 < 64); $1 is 0 or 1, so this is always 1
cmovne $2,$1,$5     # $5 = $1 when $2 != 0
ble $5,$L4          # no leading elements: skip the first loop
mov $31,$3          # loop counter = 0
mov $31,$4          # element index = 0
.align 4
$L12:
lda $3,1($3)        # counter += 1
s4addq $4,$16,$1    # $1 = p + 4 * index (index not yet updated)
addl $31,$3,$4      # index = counter as a 32-bit value
stl $31,0($1)       # store a 32-bit zero
addl $31,$4,$2      # $2 = index as a 32-bit value
cmple $5,$2,$1      # $1 = ($5 <= $2)
beq $1,$L12         # loop again while $2 < $5
lda $1,64($31)      # $1 = 64
mov $2,$7           # next element index = elements stored so far
subl $1,$4,$6       # remaining elements = 64 - index
$L4:
cmpeq $5,64,$1      # $1 = ($5 == 64)
bne $1,$L6          # return if the first loop covered all 64 elements
lda $1,64($31)      # $1 = 64
subq $1,$5,$22      # $22 = 64 - $5
sra $22,1,$4        # $4 = $22 >> 1: number of 8-byte stores
addq $4,$4,$8       # $8 = 2 * $4: elements those stores cover
ble $8,$L8          # nothing to do with 8-byte stores: skip the loop
s4addq $5,$16,$2    # $2 = p + 4 * $5: address of the first 8-byte store
mov $31,$3          # loop counter = 0
.align 4
$L10:
lda $3,1($3)        # counter += 1
stq $31,0($2)       # store a 64-bit zero (two elements)
lda $2,8($2)        # address += 8 bytes
addl $31,$3,$1      # $1 = counter as a 32-bit value
cmple $4,$1,$1      # $1 = ($4 <= counter)
beq $1,$L10         # loop again while counter < $4
subl $6,$8,$6       # remaining elements -= $8
addl $7,$8,$7       # next element index += $8
$L8:
cmpeq $22,$8,$1     # $1 = ($22 == $8): the 8-byte loop covered everything
bne $1,$L6          # return if nothing is left over
mov $31,$2          # loop counter = 0
.align 4
$L14:
addl $2,$7,$1       # $1 = counter + next element index
subl $6,1,$6        # remaining elements -= 1
lda $2,1($2)        # counter += 1
s4addq $1,$16,$1    # $1 = p + 4 * $1
stl $31,0($1)       # store a 32-bit zero
bne $6,$L14         # loop again while elements remain
$L6:
ret
which is pretty weird and inefficient code...
--
Summary: Inefficient code generated by -ftree-vectorize on Alpha
Product: gcc
Version: 4.0.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P2
Component: tree-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: falk at debian dot org
CC: gcc-bugs at gcc dot gnu dot org
GCC build triplet: alphaev68-unknown-linux-gnu
GCC host triplet: alphaev68-unknown-linux-gnu
GCC target triplet: alphaev68-unknown-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18557