https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89670
--- Comment #9 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to Jörn Engel from comment #6)
> True for one, but not the other.
>
>     return mask ? __builtin_ctz(mask) : 32;
>     1099:       83 f6 ff                xor    $0xffffffff,%esi
>     109c:       74 47                   je     10e5 <main+0x85>
>     109e:       f3 0f bc f6             tzcnt  %esi,%esi

But this is because of jump threading:

  int ml = matchlen32(src, src + 1);
  if (ml >= 32)
    ml += matchlen32(src + 32, src + 1 + 32);

does optimize to the correct thing (only one jump rather than two):

        .cfi_startproc
        vmovdqu 1(%rdi), %ymm0
        vpcmpeqd        %ymm1, %ymm1, %ymm1
        vpcmpeqb        (%rdi), %ymm0, %ymm0
        vpandn  %ymm1, %ymm0, %ymm0
        vpmovmskb       %ymm0, %eax
        testl   %eax, %eax
        jne     .L15
        vmovdqu 32(%rdi), %ymm0
        xorl    %eax, %eax
        vpcmpeqb        33(%rdi), %ymm0, %ymm0
        vpandn  %ymm1, %ymm0, %ymm0
        vpmovmskb       %ymm0, %edx
        tzcntl  %edx, %eax
        addl    $32, %eax
        testl   %edx, %edx
        movl    $64, %edx
        cmove   %edx, %eax
        vzeroupper
        ret

The other one:

.LFB4795:
        .cfi_startproc
        vmovdqu 1(%rdi), %ymm0
        vpcmpeqd        %ymm1, %ymm1, %ymm1
        vpcmpeqb        (%rdi), %ymm0, %ymm0
        vpandn  %ymm1, %ymm0, %ymm0
        vpmovmskb       %ymm0, %eax
        testl   %eax, %eax
        je      .L5
        tzcntl  %eax, %eax
        cmpl    $29, %eax
        jle     .L7
.L2:
        vmovdqu 32(%rdi), %ymm0
        vpcmpeqd        %ymm1, %ymm1, %ymm1
        vpcmpeqb        33(%rdi), %ymm0, %ymm0
        vpandn  %ymm1, %ymm0, %ymm0
        vpmovmskb       %ymm0, %edx
        tzcntl  %edx, %edx
        addl    %edx, %eax
.L7:
        vzeroupper
        ret
        .p2align 4,,10
        .p2align 3
.L5:
        movl    $32, %eax
        jmp     .L2
        .cfi_endproc

is due to jump threading too: notice how, after the test against 0, the code jumps to .L5 and then back to .L2, past the comparison against 29 :).
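
For reference, a minimal sketch of what matchlen32() plausibly looks like, reconstructed from the quoted return statement and the vpcmpeqb/vpandn/vpmovmskb/tzcnt sequences above; the body here is an assumption, only the name and the return expression appear in this report:

  #include <immintrin.h>

  /* Hypothetical reconstruction: count leading equal bytes (0..32) of two
     32-byte blocks.  Compile with -mavx2.  */
  static inline int matchlen32(const unsigned char *a, const unsigned char *b)
  {
      __m256i va = _mm256_loadu_si256((const __m256i *)a);
      __m256i vb = _mm256_loadu_si256((const __m256i *)b);
      __m256i eq = _mm256_cmpeq_epi8(va, vb);       /* 0xFF where bytes match */
      /* Invert so a set bit marks a mismatch (the asm does this with vpandn
         against all-ones before vpmovmskb; inverting the scalar mask is
         equivalent).  */
      unsigned mask = ~(unsigned)_mm256_movemask_epi8(eq);
      return mask ? __builtin_ctz(mask) : 32;       /* 32 if all bytes equal */
  }

With a definition along these lines, compiling the ml >= 32 caller shown above at -O2 -mavx2 should produce code close to the first listing, modulo label numbering.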