------- Comment #4 from pinskia at gcc dot gnu dot org 2005-11-01 16:17 -------
If we change (*p1)[d-1] to (*p1)[d], we get:
# Loop body emitted by GCC (AT&T syntax, 32-bit x86), quoted verbatim.
# Each iteration:
#   1. loads three sign-extended 16-bit values (called a, b, c below) through
#      the pointers stored at 0(%edi), 4(%edi) and 8(%edi);
#   2. computes |b - c| and |c - a| with the branch-free sar/xor/sub idiom;
#   3. turns the signed comparison |b - c| >= |c - a| into 1 or 2 and stores
#      it into a 4-byte slot indexed by %esi;
#   4. calls bar and adds the value left in %eax to the local at -16(%ebp).
# The loop exits once %esi reaches 7.
.L2:
movl 8(%edi), %eax          # eax = pointer stored at 8(%edi)
movswl (%eax),%edx          # edx = c (16-bit load, sign-extended)
movl 4(%edi), %eax          # eax = pointer stored at 4(%edi)
movswl (%eax),%eax          # eax = b (16-bit load, sign-extended)
subl %edx, %eax             # eax = b - c
movl %eax, %ecx             # ecx = copy used to build the sign mask
sarl $31, %ecx              # ecx = 0 if eax >= 0, -1 if eax < 0
xorl %ecx, %eax             # conditionally flip all bits ...
subl %ecx, %eax             # ... and add 1: eax = |b - c|
movl (%edi), %ecx           # ecx = pointer stored at 0(%edi)
movswl (%ecx),%ecx          # ecx = a (16-bit load, sign-extended)
subl %ecx, %edx             # edx = c - a
movl %edx, %ecx             # same abs idiom as above, now on edx
sarl $31, %ecx              # ecx = sign mask of (c - a)
xorl %ecx, %edx
subl %ecx, %edx             # edx = |c - a|
cmpl %edx, %eax             # flags <- |b - c| compared with |c - a|
movl 8(%ebp), %edx          # edx = base for the store below; mov leaves flags intact
                            # NOTE(review): 8(%ebp) is presumably the enclosing
                            # function's first stack argument -- confirm
setge %al                   # al = 1 if |b - c| >= |c - a| (signed), else 0
movzbl %al, %eax            # zero-extend: eax = 0 or 1
incl %eax                   # eax = 1 or 2
movl %eax, -4(%edx,%esi,4)  # store eax at address edx + 4*esi - 4
incl %esi                   # advance the loop counter
movl %eax, 8(%esp)          # outgoing slot at 8(%esp) = the 1-or-2 value
movsbl (%edi,%eax),%eax     # eax = sign-extended byte at edi + (1 or 2)
movl %ebx, (%esp)           # outgoing slot at (%esp) = current ebx
addl $32, %ebx              # ebx advances by 32 each iteration
movl %eax, 4(%esp)          # outgoing slot at 4(%esp) = the byte loaded above
call bar                    # presumably bar(ebx, byte, 1-or-2) with stack-passed
                            # args; bar's prototype is not visible here -- confirm
addl %eax, -16(%ebp)        # accumulate %eax (presumably bar's return value)
cmpl $7, %esi               # loop until esi == 7
jne .L2
for the loop, which seems very good, as there are no branches, only:
setge %al                   # al = 1 if the preceding signed compare was >=, else 0
movzbl %al, %eax            # zero-extend: eax = 0 or 1
incl %eax                   # eax = 1 or 2
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24609