On 4/26/19 10:43 AM, Alexander Monakov wrote:
> On Fri, 26 Apr 2019, Martin Liška wrote:
> 
>> I'm suggesting to adjust that to:
>> - -Os will keep using rep-scasb as -Os means optimize for size
>>  no matter what speed impact is
> 
> I'm not sure it's a good choice, the inline sequence is
> 
>         xorl    %eax, %eax
>         orq     $-1, %rcx
>         repnz scasb
>         notq    %rcx
>         decq    %rcx
> 
> compared to simply
> 
>       call    strlen
> 
> it's not even shorter.

Then I'm very open to switching to 'call strlen' even for -Os.

> 
>> - otherwise use call to strlen
>> - when -minline-all-stringops is enabled and -O2+ is used, then
>>   expand to 4B loop for all possible alignments
> 
> But it's not OK to use misaligned loads, because overreading past
> end of string may cross a page boundary and cause a segfault.

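Yes, the code handles that correctly: it emits a prologue that checks single
bytes until the pointer is 4-byte aligned, so the 4-byte loads in the main loop
are always aligned. See below: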

> What does
> your patch generate for the testcase at -O2 -minline-all-stringops?

$ cat strlen-O2.c
unsigned foo_1B (char *ptr)
{
  return __builtin_strlen (ptr);
}

unsigned foo_4B (char *ptr)
{
  return __builtin_strlen (__builtin_assume_aligned (ptr, 4));
}

$ gcc strlen-O2.c -O2 -S -o/dev/stdout -minline-all-stringops
...
foo_1B:
.LFB0:
        .cfi_startproc
        movq    %rdi, %rdx
        movq    %rdi, %rax
        andl    $3, %edx
        je      .L10
        cmpq    $2, %rdx
        je      .L5
        ja      .L4
        cmpb    $0, (%rdi)
        je      .L3
        leaq    1(%rdi), %rax
.L5:
        cmpb    $0, (%rax)
        je      .L3
        addq    $1, %rax
.L4:
        cmpb    $0, (%rax)
        je      .L3
        addq    $1, %rax
        .p2align 4,,10
        .p2align 3
.L10:
        movl    (%rax), %ecx
        addq    $4, %rax
        leal    -16843009(%rcx), %edx
        notl    %ecx
        andl    %ecx, %edx
        andl    $-2139062144, %edx
        je      .L10
        movl    %edx, %ecx
        shrl    $16, %ecx
        testl   $32896, %edx
        cmove   %ecx, %edx
        leaq    2(%rax), %rcx
        cmove   %rcx, %rax
        movl    %edx, %esi
        addb    %dl, %sil
        sbbq    $3, %rax
.L3:
        subq    %rdi, %rax
        ret
...
foo_4B:
.LFB1:
        .cfi_startproc
        movq    %rdi, %rax
.L16:
        movl    (%rax), %ecx
        addq    $4, %rax
        leal    -16843009(%rcx), %edx
        notl    %ecx
        andl    %ecx, %edx
        andl    $-2139062144, %edx
        je      .L16
        movl    %edx, %ecx
        shrl    $16, %ecx
        testl   $32896, %edx
        cmove   %ecx, %edx
        leaq    2(%rax), %rcx
        cmove   %rcx, %rax
        movl    %edx, %esi
        addb    %dl, %sil
        sbbq    $3, %rax
        subq    %rdi, %rax
        ret

Martin

> 
> Alexander
> 

Reply via email to