On Tue, Mar 9, 2010 at 3:58 PM, Basile Starynkevitch <bas...@starynkevitch.net> wrote: > Hello All, > > With a recently compiled gcc-trunk on x86-64/linux, I am compiling the > folllowing example: > > ################# > > /* file testmanychar.c */ > extern void g (int, char *, char *, char *); > > void > f (void) > { > char x0, x1, x2, x3, x4, x5, x6, x7; > /* assuming x0 is word aligned on a x86_64, and variables are bytes in > memory, we could clear all the variables in one machine instruction */ > x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = (char) 0; > g (10, &x0, &x1, &x2); > g (20, &x2, &x3, &x4); > g (30, &x4, &x5, &x6); > g (40, &x6, &x7, &x0); > } > > ################# > > My intuition was that GCC could store x0 on a 64 bits aligned byte, and x1 > immediately after, and so one, and clear all the eight bytes at once using a > single machine instruction [clearing a 64 bits word]. > > But this is not the case, since > gcc-trunk -S -O3 -fverbose-asm testmanychar.c > gives the following code > > ################# > .type f, @function > f: > .LFB0: > .cfi_startproc > movq %rbx, -24(%rsp) #, > movq %rbp, -16(%rsp) #, > movl $10, %edi #, > movq %r12, -8(%rsp) #, > subq $40, %rsp #, > .cfi_def_cfa_offset 48 > leaq 13(%rsp), %rbx #, tmp58 > .cfi_offset 12, -16 > .cfi_offset 6, -24 > .cfi_offset 3, -32 > leaq 15(%rsp), %rbp #, tmp60 > leaq 14(%rsp), %rdx #, tmp59 > leaq 11(%rsp), %r12 #, tmp61 > movb $0, 8(%rsp) #, x7 > movb $0, 9(%rsp) #, x6 > movq %rbx, %rcx # tmp58, > movq %rbp, %rsi # tmp60, > movb $0, 10(%rsp) #, x5 > movb $0, 11(%rsp) #, x4 > movb $0, 12(%rsp) #, x3 > movb $0, 13(%rsp) #, x2 > movb $0, 14(%rsp) #, x1 > movb $0, 15(%rsp) #, x0 > call g # > leaq 12(%rsp), %rdx #, tmp62 > movq %r12, %rcx # tmp61, > movq %rbx, %rsi # tmp58, > movl $20, %edi #, > leaq 9(%rsp), %rbx #, tmp64 > call g # > leaq 10(%rsp), %rdx #, tmp65 > movq %rbx, %rcx # tmp64, > movq %r12, %rsi # tmp61, > movl $30, %edi #, > call g # > leaq 8(%rsp), %rdx #, tmp68 > movq %rbp, %rcx # 
tmp60, > movq %rbx, %rsi # tmp64, > movl $40, %edi #, > call g # > movq 16(%rsp), %rbx #, > movq 24(%rsp), %rbp #, > movq 32(%rsp), %r12 #, > addq $40, %rsp #, > .cfi_def_cfa_offset 8 > ret > .cfi_endproc > .LFE0: > .size f, .-f > .ident "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision > 157303]" > > ##################### > > > With > gcc-trunk -S -O3 -fverbose-asm -march=core2 -mtune=core2 testmanychar.c > I am getting still > > ################## > > # options passed: testmanychar.c -march=core2 -mtune=core2 -O3 > > .globl f > .type f, @function > f: > .LFB0: > .cfi_startproc > movq %rbx, -24(%rsp) #, > movq %rbp, -16(%rsp) #, > movq %r12, -8(%rsp) #, > movl $10, %edi #, > subq $40, %rsp #, > .cfi_def_cfa_offset 48 > leaq 13(%rsp), %rbx #, tmp58 > .cfi_offset 12, -16 > .cfi_offset 6, -24 > .cfi_offset 3, -32 > leaq 15(%rsp), %rbp #, tmp60 > leaq 11(%rsp), %r12 #, tmp61 > leaq 14(%rsp), %rdx #, tmp59 > movq %rbx, %rcx # tmp58, > movq %rbp, %rsi # tmp60, > movb $0, 8(%rsp) #, x7 > movb $0, 9(%rsp) #, x6 > movb $0, 10(%rsp) #, x5 > movb $0, 11(%rsp) #, x4 > movb $0, 12(%rsp) #, x3 > movb $0, 13(%rsp) #, x2 > movb $0, 14(%rsp) #, x1 > movb $0, 15(%rsp) #, x0 > call g # > leaq 12(%rsp), %rdx #, tmp62 > movq %r12, %rcx # tmp61, > movq %rbx, %rsi # tmp58, > movl $20, %edi #, > leaq 9(%rsp), %rbx #, tmp64 > call g # > leaq 10(%rsp), %rdx #, tmp65 > movq %rbx, %rcx # tmp64, > movq %r12, %rsi # tmp61, > movl $30, %edi #, > call g # > leaq 8(%rsp), %rdx #, tmp68 > movq %rbp, %rcx # tmp60, > movq %rbx, %rsi # tmp64, > movl $40, %edi #, > call g # > movq 16(%rsp), %rbx #, > movq 24(%rsp), %rbp #, > movq 32(%rsp), %r12 #, > addq $40, %rsp #, > .cfi_def_cfa_offset 8 > ret > .cfi_endproc > .LFE0: > .size f, .-f > .ident "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision > 157303]" > > #### > I was hoping that > movb $0, 8(%rsp) #, x7 > movb $0, 9(%rsp) #, x6 > movb $0, 10(%rsp) #, x5 > movb $0, 11(%rsp) #, x4 > movb $0, 12(%rsp) #, x3 > movb $0, 13(%rsp) #, x2 > 
movb $0, 14(%rsp) #, x1 > movb $0, 15(%rsp) #, x0 > could be just something like > movq $0, 8(%rsp) > or something similar. > > I do realize that such an optimization is difficult to implement... > (probably messing with the register allocator, etc...). Or is the Core2 processor > sufficiently smart to execute a sequence of 8 consecutive byte > moves exactly as fast as a single 8-byte word move? > > > Regards. > -- > Basile STARYNKEVITCH http://starynkevitch.net/Basile/ > email: basile<at>starynkevitch<dot>net mobile: +33 6 8501 2359 > 8, rue de la Faiencerie, 92340 Bourg La Reine, France > *** opinions {are only mines, sont seulement les miennes} *** >
What you're describing is a kind of vectorization. To make vectorization easier for the compiler, you should store your data in arrays instead of separate variables, and use loops to process the data instead of separate operations. In the following example, gcc vectorized the operations only for the 'z' array. sal...@salmin:~/test$ cat mov16.c extern void g16(char *x0, char *x1, char *x2, char *x3, char *x4, char *x5, char *x6, char *x7, char *x8, char *x9, char *x10, char *x11, char *x12, char *x13, char *x14, char *x15); extern void g(char *z); int main() { char x0, x1, x2, x3, x4, x5, x6, x7; char x8, x9, x10, x11, x12, x13, x14, x15; char y[16]; char z[16]; int i; x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = 0; x8 = x9 = x10 = x11 = x12 = x13 = x14 = x15 = 0; g16(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &x8, &x9, &x10, &x11, &x12, &x13, &x14, &x15); y[0] = y[1] = y[2] = y[3] = y[4] = y[5] = y[6] = y[7] = 0; y[8] = y[9] = y[10] = y[11] = y[12] = y[13] = y[14] = y[15] = 0; g(y); for (i = 0; i < 16; i++) z[i] = 0; g(z); return 0; } sal...@salmin:~/test$ gcc -S -O3 mov16.c sal...@salmin:~/test$ cat mov16.s .file "mov16.c" .text .p2align 4,,15 .globl main .type main, @function main: .LFB0: .cfi_startproc subq $136, %rsp .cfi_def_cfa_offset 144 leaq 112(%rsp), %rax leaq 124(%rsp), %rcx leaq 125(%rsp), %rdx leaq 126(%rsp), %rsi leaq 122(%rsp), %r9 leaq 123(%rsp), %r8 movq %rax, 72(%rsp) leaq 113(%rsp), %rax leaq 127(%rsp), %rdi movb $0, 120(%rsp) movb $0, 121(%rsp) movq %rax, 64(%rsp) leaq 114(%rsp), %rax movb $0, 122(%rsp) movb $0, 123(%rsp) movb $0, 124(%rsp) movq %rax, 56(%rsp) leaq 115(%rsp), %rax movb $0, 125(%rsp) movb $0, 126(%rsp) movb $0, 127(%rsp) movq %rax, 48(%rsp) leaq 116(%rsp), %rax movb $0, 112(%rsp) movb $0, 113(%rsp) movb $0, 114(%rsp) movq %rax, 40(%rsp) leaq 117(%rsp), %rax movb $0, 115(%rsp) movb $0, 116(%rsp) movb $0, 117(%rsp) movq %rax, 32(%rsp) leaq 118(%rsp), %rax movb $0, 118(%rsp) movb $0, 119(%rsp) movq %rax, 24(%rsp) leaq 119(%rsp), 
%rax movq %rax, 16(%rsp) leaq 120(%rsp), %rax movq %rax, 8(%rsp) leaq 121(%rsp), %rax movq %rax, (%rsp) call g16 leaq 96(%rsp), %rdi movb $0, 103(%rsp) movb $0, 102(%rsp) movb $0, 101(%rsp) movb $0, 100(%rsp) movb $0, 99(%rsp) movb $0, 98(%rsp) movb $0, 97(%rsp) movb $0, 96(%rsp) movb $0, 111(%rsp) movb $0, 110(%rsp) movb $0, 109(%rsp) movb $0, 108(%rsp) movb $0, 107(%rsp) movb $0, 106(%rsp) movb $0, 105(%rsp) movb $0, 104(%rsp) call g pxor %xmm0, %xmm0 leaq 80(%rsp), %rdi movdqa %xmm0, 80(%rsp) call g xorl %eax, %eax addq $136, %rsp ret .cfi_endproc .LFE0: .size main, .-main .ident "GCC: (Debian 4.4.2-8) 4.4.2" .section .note.GNU-stack,"",@progbits However, it seems that gcc doesn't consider replacing multiple movb instructions with a single movq, even for loops and arrays. sal...@salmin:~/test$ cat mov8.c extern void g8(char *x0, char *x1, char *x2, char *x3, char *x4, char *x5, char *x6, char *x7); extern void g(char *z); int main() { char x0, x1, x2, x3, x4, x5, x6, x7; char y[8]; char z[8]; int i; x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = 0; g8(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7); y[0] = y[1] = y[2] = y[3] = y[4] = y[5] = y[6] = y[7] = 0; g(y); for (i = 0; i < 8; i++) z[i] = 0; g(z); return 0; } sal...@salmin:~/test$ gcc -S -O3 mov8.c sal...@salmin:~/test$ cat mov8.s .file "mov8.c" .text .p2align 4,,15 .globl main .type main, @function main: .LFB0: .cfi_startproc subq $56, %rsp .cfi_def_cfa_offset 64 leaq 40(%rsp), %rax leaq 44(%rsp), %rcx leaq 45(%rsp), %rdx leaq 46(%rsp), %rsi leaq 42(%rsp), %r9 leaq 43(%rsp), %r8 leaq 47(%rsp), %rdi movq %rax, 8(%rsp) leaq 41(%rsp), %rax movb $0, 40(%rsp) movb $0, 41(%rsp) movq %rax, (%rsp) movb $0, 42(%rsp) movb $0, 43(%rsp) movb $0, 44(%rsp) movb $0, 45(%rsp) movb $0, 46(%rsp) movb $0, 47(%rsp) call g8 leaq 32(%rsp), %rdi movb $0, 39(%rsp) movb $0, 38(%rsp) movb $0, 37(%rsp) movb $0, 36(%rsp) movb $0, 35(%rsp) movb $0, 34(%rsp) movb $0, 33(%rsp) movb $0, 32(%rsp) call g leaq 16(%rsp), %rdi movb $0, 16(%rsp) movb $0, 17(%rsp) movb $0, 18(%rsp) 
movb $0, 19(%rsp) movb $0, 20(%rsp) movb $0, 21(%rsp) movb $0, 22(%rsp) movb $0, 23(%rsp) call g xorl %eax, %eax addq $56, %rsp ret .cfi_endproc .LFE0: .size main, .-main .ident "GCC: (Debian 4.4.2-8) 4.4.2" .section .note.GNU-stack,"",@progbits Alexey