On 4/26/19 10:43 AM, Alexander Monakov wrote: > On Fri, 26 Apr 2019, Martin Liška wrote: > >> I'm suggesting to adjust that to: >> - -Os will keep using rep-scasb as -Os means optimize for size >> no matter what speed impact is > > I'm not sure it's a good choice, the inline sequence is > > xorl %eax, %eax > orq $-1, %rcx > repnz scasb > notq %rcx > decq %rcx > > compared to simply > > call strlen > > it's not even shorter.
Then I'm very open to adjusting that even for -Os to 'call strlen'. > >> - otherwise use call to strlen >> - when -minline-all-stringops is enabled and -O2+ is used, then >> expand to 4B loop for all possible alignments > > But it's not OK to use misaligned loads, because overreading past > end of string may cross a page boundary and cause a segfault. Yes, the code is handling that correctly by doing a prologue, see below: > What does > your patch generate for the testcase at -O2 -minline-all-stringops? $ cat strlen-O2.c unsigned foo_1B (char *ptr) { return __builtin_strlen (ptr); } unsigned foo_4B (char *ptr) { return __builtin_strlen (__builtin_assume_aligned (ptr, 4)); } $ gcc strlen-O2.c -O2 -S -o/dev/stdout -minline-all-stringops ... foo_1B: .LFB0: .cfi_startproc movq %rdi, %rdx movq %rdi, %rax andl $3, %edx je .L10 cmpq $2, %rdx je .L5 ja .L4 cmpb $0, (%rdi) je .L3 leaq 1(%rdi), %rax .L5: cmpb $0, (%rax) je .L3 addq $1, %rax .L4: cmpb $0, (%rax) je .L3 addq $1, %rax .p2align 4,,10 .p2align 3 .L10: movl (%rax), %ecx addq $4, %rax leal -16843009(%rcx), %edx notl %ecx andl %ecx, %edx andl $-2139062144, %edx je .L10 movl %edx, %ecx shrl $16, %ecx testl $32896, %edx cmove %ecx, %edx leaq 2(%rax), %rcx cmove %rcx, %rax movl %edx, %esi addb %dl, %sil sbbq $3, %rax .L3: subq %rdi, %rax ret ... foo_4B: .LFB1: .cfi_startproc movq %rdi, %rax .L16: movl (%rax), %ecx addq $4, %rax leal -16843009(%rcx), %edx notl %ecx andl %ecx, %edx andl $-2139062144, %edx je .L16 movl %edx, %ecx shrl $16, %ecx testl $32896, %edx cmove %ecx, %edx leaq 2(%rax), %rcx cmove %rcx, %rax movl %edx, %esi addb %dl, %sil sbbq $3, %rax subq %rdi, %rax ret Martin > > Alexander >