https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89670
--- Comment #9 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to Jörn Engel from comment #6)
> True for one, but not the other.
>
> return mask ? __builtin_ctz(mask) : 32;
> 1099: 83 f6 ff xor $0xffffffff,%esi
> 109c: 74 47 je 10e5 <main+0x85>
> 109e: f3 0f bc f6 tzcnt %esi,%esi
>
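For reference, the quoted pattern in isolation presumably reduces to
something like this (a minimal sketch; the function name first_set is
made up here):

#include <stdint.h>

/* Hypothetical standalone reduction of the quoted pattern:
   returns the index of the lowest set bit, or 32 when mask is 0.
   tzcnt already returns 32 for a zero input, but because
   __builtin_ctz is undefined at 0, GCC emits the explicit test
   and branch seen in the dump above rather than a lone tzcnt. */
int first_set(uint32_t mask)
{
    return mask ? __builtin_ctz(mask) : 32;
}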
But this is because of jump threading:
int ml = matchlen32(src, src + 1);
if (ml >= 32)
        ml += matchlen32(src + 32, src + 1 + 32);
This does optimize to the correct thing (only one jump rather than two):
.cfi_startproc
vmovdqu 1(%rdi), %ymm0
vpcmpeqd %ymm1, %ymm1, %ymm1
vpcmpeqb (%rdi), %ymm0, %ymm0
vpandn %ymm1, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
testl %eax, %eax
jne .L15
vmovdqu 32(%rdi), %ymm0
xorl %eax, %eax
vpcmpeqb 33(%rdi), %ymm0, %ymm0
vpandn %ymm1, %ymm0, %ymm0
vpmovmskb %ymm0, %edx
tzcntl %edx, %eax
addl $32, %eax
testl %edx, %edx
movl $64, %edx
cmove %edx, %eax
vzeroupper
ret
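For context, matchlen32 is presumably something along these lines (a
sketch inferred from the asm above, not the actual testcase; it counts
the matching leading bytes of two 32-byte blocks):

#include <immintrin.h>

/* Sketch: compare 32 bytes of a and b, return the number of leading
   bytes that match (32 if all match).  The inverted compare mask is
   what the vpandn against all-ones computes in the dumps. */
static int matchlen32(const char *a, const char *b)
{
    __m256i va = _mm256_loadu_si256((const __m256i *)a);
    __m256i vb = _mm256_loadu_si256((const __m256i *)b);
    unsigned mask = ~(unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(va, vb));
    return mask ? __builtin_ctz(mask) : 32;
}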
The other one:
.LFB4795:
.cfi_startproc
vmovdqu 1(%rdi), %ymm0
vpcmpeqd %ymm1, %ymm1, %ymm1
vpcmpeqb (%rdi), %ymm0, %ymm0
vpandn %ymm1, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
testl %eax, %eax
je .L5
tzcntl %eax, %eax
cmpl $29, %eax
jle .L7
.L2:
vmovdqu 32(%rdi), %ymm0
vpcmpeqd %ymm1, %ymm1, %ymm1
vpcmpeqb 33(%rdi), %ymm0, %ymm0
vpandn %ymm1, %ymm0, %ymm0
vpmovmskb %ymm0, %edx
tzcntl %edx, %edx
addl %edx, %eax
.L7:
vzeroupper
ret
.p2align 4,,10
.p2align 3
.L5:
movl $32, %eax
jmp .L2
.cfi_endproc
This is due to jump threading too: notice how, after the test against 0, the
code jumps to .L5 and then back to .L2, past the comparison against 29 :).
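In other words (a hand-written illustration of the transform, not GCC
output; the >= 30 threshold is inferred from the cmpl $29 above, and
second_block() is a stand-in for the second matchlen32 call):

extern int second_block(void);  /* stand-in, not from the testcase */

int example(unsigned mask1)
{
    int ml = mask1 ? __builtin_ctz(mask1) : 32;  /* first matchlen32 */
    if (ml >= 30)                                /* cmpl $29; jle */
        ml += second_block();
    return ml;
}

/* On the mask1 == 0 path ml is known to be 32, so ml >= 30 is
   statically true.  Jump threading routes that path (.L5 above)
   straight to the second block (.L2), bypassing the comparison --
   and once the two paths are threaded apart like this, the mask1
   test can no longer be folded into a single branchless tzcnt. */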