https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89670

--- Comment #9 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to Jörn Engel from comment #6)
> True for one, but not the other.
> 
>         return mask ? __builtin_ctz(mask) : 32;
>     1099:       83 f6 ff                xor    $0xffffffff,%esi
>     109c:       74 47                   je     10e5 <main+0x85>
>     109e:       f3 0f bc f6             tzcnt  %esi,%esi
> 

But this is because of jump threading:

        int ml = matchlen32(src, src + 1);
        if (ml >= 32)
                ml += matchlen32(src + 32, src + 1 + 32);

Does optimize to the correct thing (only one jump rather than 2):
        .cfi_startproc
        vmovdqu 1(%rdi), %ymm0
        vpcmpeqd        %ymm1, %ymm1, %ymm1
        vpcmpeqb        (%rdi), %ymm0, %ymm0
        vpandn  %ymm1, %ymm0, %ymm0
        vpmovmskb       %ymm0, %eax
        testl   %eax, %eax
        jne     .L15
        vmovdqu 32(%rdi), %ymm0
        xorl    %eax, %eax
        vpcmpeqb        33(%rdi), %ymm0, %ymm0
        vpandn  %ymm1, %ymm0, %ymm0
        vpmovmskb       %ymm0, %edx
        tzcntl  %edx, %eax
        addl    $32, %eax
        testl   %edx, %edx
        movl    $64, %edx
        cmove   %edx, %eax
        vzeroupper
        ret

The other one:
.LFB4795:
        .cfi_startproc
        vmovdqu 1(%rdi), %ymm0
        vpcmpeqd        %ymm1, %ymm1, %ymm1
        vpcmpeqb        (%rdi), %ymm0, %ymm0
        vpandn  %ymm1, %ymm0, %ymm0
        vpmovmskb       %ymm0, %eax
        testl   %eax, %eax
        je      .L5
        tzcntl  %eax, %eax
        cmpl    $29, %eax
        jle     .L7
.L2:
        vmovdqu 32(%rdi), %ymm0
        vpcmpeqd        %ymm1, %ymm1, %ymm1
        vpcmpeqb        33(%rdi), %ymm0, %ymm0
        vpandn  %ymm1, %ymm0, %ymm0
        vpmovmskb       %ymm0, %edx
        tzcntl  %edx, %edx
        addl    %edx, %eax
.L7:
        vzeroupper
        ret
        .p2align 4,,10
        .p2align 3
.L5:
        movl    $32, %eax
        jmp     .L2
        .cfi_endproc

Is due to jump threading too: notice how, after the test against 0, the code
jumps to .L5 (which sets the result to 32) and from there straight to .L2,
skipping past the comparison against 29 :).

Reply via email to