On Tue, Mar 9, 2010 at 3:58 PM, Basile Starynkevitch
<[email protected]> wrote:
> Hello All,
>
> With a recently compiled gcc-trunk on x86-64/linux, I am compiling the
> following example:
>
> #################
>
> /* file testmanychar.c */
> extern void g (int, char *, char *, char *);
>
> void
> f (void)
> {
> char x0, x1, x2, x3, x4, x5, x6, x7;
> /* assuming x0 is word aligned on a x86_64, and variables are bytes in
> memory, we could clear all the variables in one machine instruction */
> x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = (char) 0;
> g (10, &x0, &x1, &x2);
> g (20, &x2, &x3, &x4);
> g (30, &x4, &x5, &x6);
> g (40, &x6, &x7, &x0);
> }
>
> #################
>
> My intuition was that GCC could store x0 at a 64-bit-aligned address, and x1
> immediately after, and so on, and clear all eight bytes at once using a
> single machine instruction [clearing a 64-bit word].
>
> But this is not the case, since
> gcc-trunk -S -O3 -fverbose-asm testmanychar.c
> gives the following code
>
> #################
> .type f, @function
> f:
> .LFB0:
> .cfi_startproc
> movq %rbx, -24(%rsp) #,
> movq %rbp, -16(%rsp) #,
> movl $10, %edi #,
> movq %r12, -8(%rsp) #,
> subq $40, %rsp #,
> .cfi_def_cfa_offset 48
> leaq 13(%rsp), %rbx #, tmp58
> .cfi_offset 12, -16
> .cfi_offset 6, -24
> .cfi_offset 3, -32
> leaq 15(%rsp), %rbp #, tmp60
> leaq 14(%rsp), %rdx #, tmp59
> leaq 11(%rsp), %r12 #, tmp61
> movb $0, 8(%rsp) #, x7
> movb $0, 9(%rsp) #, x6
> movq %rbx, %rcx # tmp58,
> movq %rbp, %rsi # tmp60,
> movb $0, 10(%rsp) #, x5
> movb $0, 11(%rsp) #, x4
> movb $0, 12(%rsp) #, x3
> movb $0, 13(%rsp) #, x2
> movb $0, 14(%rsp) #, x1
> movb $0, 15(%rsp) #, x0
> call g #
> leaq 12(%rsp), %rdx #, tmp62
> movq %r12, %rcx # tmp61,
> movq %rbx, %rsi # tmp58,
> movl $20, %edi #,
> leaq 9(%rsp), %rbx #, tmp64
> call g #
> leaq 10(%rsp), %rdx #, tmp65
> movq %rbx, %rcx # tmp64,
> movq %r12, %rsi # tmp61,
> movl $30, %edi #,
> call g #
> leaq 8(%rsp), %rdx #, tmp68
> movq %rbp, %rcx # tmp60,
> movq %rbx, %rsi # tmp64,
> movl $40, %edi #,
> call g #
> movq 16(%rsp), %rbx #,
> movq 24(%rsp), %rbp #,
> movq 32(%rsp), %r12 #,
> addq $40, %rsp #,
> .cfi_def_cfa_offset 8
> ret
> .cfi_endproc
> .LFE0:
> .size f, .-f
> .ident "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision
> 157303]"
>
> #####################
>
>
> With
> gcc-trunk -S -O3 -fverbose-asm -march=core2 -mtune=core2 testmanychar.c
> I am still getting
>
> ##################
>
> # options passed: testmanychar.c -march=core2 -mtune=core2 -O3
>
> .globl f
> .type f, @function
> f:
> .LFB0:
> .cfi_startproc
> movq %rbx, -24(%rsp) #,
> movq %rbp, -16(%rsp) #,
> movq %r12, -8(%rsp) #,
> movl $10, %edi #,
> subq $40, %rsp #,
> .cfi_def_cfa_offset 48
> leaq 13(%rsp), %rbx #, tmp58
> .cfi_offset 12, -16
> .cfi_offset 6, -24
> .cfi_offset 3, -32
> leaq 15(%rsp), %rbp #, tmp60
> leaq 11(%rsp), %r12 #, tmp61
> leaq 14(%rsp), %rdx #, tmp59
> movq %rbx, %rcx # tmp58,
> movq %rbp, %rsi # tmp60,
> movb $0, 8(%rsp) #, x7
> movb $0, 9(%rsp) #, x6
> movb $0, 10(%rsp) #, x5
> movb $0, 11(%rsp) #, x4
> movb $0, 12(%rsp) #, x3
> movb $0, 13(%rsp) #, x2
> movb $0, 14(%rsp) #, x1
> movb $0, 15(%rsp) #, x0
> call g #
> leaq 12(%rsp), %rdx #, tmp62
> movq %r12, %rcx # tmp61,
> movq %rbx, %rsi # tmp58,
> movl $20, %edi #,
> leaq 9(%rsp), %rbx #, tmp64
> call g #
> leaq 10(%rsp), %rdx #, tmp65
> movq %rbx, %rcx # tmp64,
> movq %r12, %rsi # tmp61,
> movl $30, %edi #,
> call g #
> leaq 8(%rsp), %rdx #, tmp68
> movq %rbp, %rcx # tmp60,
> movq %rbx, %rsi # tmp64,
> movl $40, %edi #,
> call g #
> movq 16(%rsp), %rbx #,
> movq 24(%rsp), %rbp #,
> movq 32(%rsp), %r12 #,
> addq $40, %rsp #,
> .cfi_def_cfa_offset 8
> ret
> .cfi_endproc
> .LFE0:
> .size f, .-f
> .ident "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision
> 157303]"
>
> ####
> I was hoping that
> movb $0, 8(%rsp) #, x7
> movb $0, 9(%rsp) #, x6
> movb $0, 10(%rsp) #, x5
> movb $0, 11(%rsp) #, x4
> movb $0, 12(%rsp) #, x3
> movb $0, 13(%rsp) #, x2
> movb $0, 14(%rsp) #, x1
> movb $0, 15(%rsp) #, x0
> could be just something like
> movq $0, 8(%rsp)
> or something similar.
>
> I do realize that such an optimization is difficult to implement...
> (probably messing with the register allocator, etc...). Or is the Core2
> processor sufficiently smart to execute a sequence of 8 consecutive byte
> moves exactly as fast as a single 8-byte word move?
>
>
> Regards.
> --
> Basile STARYNKEVITCH http://starynkevitch.net/Basile/
> email: basile<at>starynkevitch<dot>net mobile: +33 6 8501 2359
> 8, rue de la Faiencerie, 92340 Bourg La Reine, France
> *** opinions {are only mines, sont seulement les miennes} ***
>
The thing you're talking about is a kind of vectorization. If you want to
make vectorization easier for the compiler, you should store your data
in arrays instead of separate variables, and use loops to process your
data instead of separate operations. In the following example, gcc
vectorized the operations only for the 'z' array.
sal...@salmin:~/test$ cat mov16.c
extern void g16(char *x0, char *x1, char *x2, char *x3,
char *x4, char *x5, char *x6, char *x7,
char *x8, char *x9, char *x10, char *x11,
char *x12, char *x13, char *x14, char *x15);
extern void g(char *z);
int main() {
char x0, x1, x2, x3, x4, x5, x6, x7;
char x8, x9, x10, x11, x12, x13, x14, x15;
char y[16];
char z[16];
int i;
x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = 0;
x8 = x9 = x10 = x11 = x12 = x13 = x14 = x15 = 0;
g16(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7,
&x8, &x9, &x10, &x11, &x12, &x13, &x14, &x15);
y[0] = y[1] = y[2] = y[3] = y[4] = y[5] = y[6] = y[7] = 0;
y[8] = y[9] = y[10] = y[11] = y[12] = y[13] = y[14] = y[15] = 0;
g(y);
for (i = 0; i < 16; i++)
z[i] = 0;
g(z);
return 0;
}
sal...@salmin:~/test$ gcc -S -O3 mov16.c
sal...@salmin:~/test$ cat mov16.s
.file "mov16.c"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB0:
.cfi_startproc
subq $136, %rsp
.cfi_def_cfa_offset 144
leaq 112(%rsp), %rax
leaq 124(%rsp), %rcx
leaq 125(%rsp), %rdx
leaq 126(%rsp), %rsi
leaq 122(%rsp), %r9
leaq 123(%rsp), %r8
movq %rax, 72(%rsp)
leaq 113(%rsp), %rax
leaq 127(%rsp), %rdi
movb $0, 120(%rsp)
movb $0, 121(%rsp)
movq %rax, 64(%rsp)
leaq 114(%rsp), %rax
movb $0, 122(%rsp)
movb $0, 123(%rsp)
movb $0, 124(%rsp)
movq %rax, 56(%rsp)
leaq 115(%rsp), %rax
movb $0, 125(%rsp)
movb $0, 126(%rsp)
movb $0, 127(%rsp)
movq %rax, 48(%rsp)
leaq 116(%rsp), %rax
movb $0, 112(%rsp)
movb $0, 113(%rsp)
movb $0, 114(%rsp)
movq %rax, 40(%rsp)
leaq 117(%rsp), %rax
movb $0, 115(%rsp)
movb $0, 116(%rsp)
movb $0, 117(%rsp)
movq %rax, 32(%rsp)
leaq 118(%rsp), %rax
movb $0, 118(%rsp)
movb $0, 119(%rsp)
movq %rax, 24(%rsp)
leaq 119(%rsp), %rax
movq %rax, 16(%rsp)
leaq 120(%rsp), %rax
movq %rax, 8(%rsp)
leaq 121(%rsp), %rax
movq %rax, (%rsp)
call g16
leaq 96(%rsp), %rdi
movb $0, 103(%rsp)
movb $0, 102(%rsp)
movb $0, 101(%rsp)
movb $0, 100(%rsp)
movb $0, 99(%rsp)
movb $0, 98(%rsp)
movb $0, 97(%rsp)
movb $0, 96(%rsp)
movb $0, 111(%rsp)
movb $0, 110(%rsp)
movb $0, 109(%rsp)
movb $0, 108(%rsp)
movb $0, 107(%rsp)
movb $0, 106(%rsp)
movb $0, 105(%rsp)
movb $0, 104(%rsp)
call g
pxor %xmm0, %xmm0
leaq 80(%rsp), %rdi
movdqa %xmm0, 80(%rsp)
call g
xorl %eax, %eax
addq $136, %rsp
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (Debian 4.4.2-8) 4.4.2"
.section .note.GNU-stack,"",@progbits
However, it seems that gcc doesn't consider replacing multiple movbs
with a single movq, even for loops and arrays.
sal...@salmin:~/test$ cat mov8.c
extern void g8(char *x0, char *x1, char *x2, char *x3,
char *x4, char *x5, char *x6, char *x7);
extern void g(char *z);
int main() {
char x0, x1, x2, x3, x4, x5, x6, x7;
char y[8];
char z[8];
int i;
x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = 0;
g8(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7);
y[0] = y[1] = y[2] = y[3] = y[4] = y[5] = y[6] = y[7] = 0;
g(y);
for (i = 0; i < 8; i++)
z[i] = 0;
g(z);
return 0;
}
sal...@salmin:~/test$ gcc -S -O3 mov8.c
sal...@salmin:~/test$ cat mov8.s
.file "mov8.c"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB0:
.cfi_startproc
subq $56, %rsp
.cfi_def_cfa_offset 64
leaq 40(%rsp), %rax
leaq 44(%rsp), %rcx
leaq 45(%rsp), %rdx
leaq 46(%rsp), %rsi
leaq 42(%rsp), %r9
leaq 43(%rsp), %r8
leaq 47(%rsp), %rdi
movq %rax, 8(%rsp)
leaq 41(%rsp), %rax
movb $0, 40(%rsp)
movb $0, 41(%rsp)
movq %rax, (%rsp)
movb $0, 42(%rsp)
movb $0, 43(%rsp)
movb $0, 44(%rsp)
movb $0, 45(%rsp)
movb $0, 46(%rsp)
movb $0, 47(%rsp)
call g8
leaq 32(%rsp), %rdi
movb $0, 39(%rsp)
movb $0, 38(%rsp)
movb $0, 37(%rsp)
movb $0, 36(%rsp)
movb $0, 35(%rsp)
movb $0, 34(%rsp)
movb $0, 33(%rsp)
movb $0, 32(%rsp)
call g
leaq 16(%rsp), %rdi
movb $0, 16(%rsp)
movb $0, 17(%rsp)
movb $0, 18(%rsp)
movb $0, 19(%rsp)
movb $0, 20(%rsp)
movb $0, 21(%rsp)
movb $0, 22(%rsp)
movb $0, 23(%rsp)
call g
xorl %eax, %eax
addq $56, %rsp
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (Debian 4.4.2-8) 4.4.2"
.section .note.GNU-stack,"",@progbits
Alexey