Re: clearing many bytes variables (could use one machine instruction)?

Alexey Salmin Tue, 09 Mar 2010 07:25:01 -0800

On Tue, Mar 9, 2010 at 3:58 PM, Basile Starynkevitch
<bas...@starynkevitch.net> wrote:
> Hello All,
>
> With a recently compiled gcc-trunk on x86-64/linux, I am compiling the
> following example:
>
> #################
>
> /* file testmanychar.c */
> extern void g (int, char *, char *, char *);
>
> void
> f (void)
> {
>  char x0, x1, x2, x3, x4, x5, x6, x7;
>  /* assuming  x0 is word aligned on a x86_64, and variables are bytes in 
> memory, we could clear all the variables in one machine instruction */
>  x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = (char) 0;
>  g (10, &x0, &x1, &x2);
>  g (20, &x2, &x3, &x4);
>  g (30, &x4, &x5, &x6);
>  g (40, &x6, &x7, &x0);
> }
>
> #################
>
> My intuition was that GCC could store x0 at a 64-bit-aligned address, x1
> immediately after it, and so on, and then clear all eight bytes at once using a
> single machine instruction [clearing a 64-bit word].
>
> But this is not the case, since
>   gcc-trunk -S -O3 -fverbose-asm testmanychar.c
> gives the following code
>
> #################
>        .type   f, @function
> f:
> .LFB0:
>        .cfi_startproc
>        movq    %rbx, -24(%rsp) #,
>        movq    %rbp, -16(%rsp) #,
>        movl    $10, %edi       #,
>        movq    %r12, -8(%rsp)  #,
>        subq    $40, %rsp       #,
>        .cfi_def_cfa_offset 48
>        leaq    13(%rsp), %rbx  #, tmp58
>        .cfi_offset 12, -16
>        .cfi_offset 6, -24
>        .cfi_offset 3, -32
>        leaq    15(%rsp), %rbp  #, tmp60
>        leaq    14(%rsp), %rdx  #, tmp59
>        leaq    11(%rsp), %r12  #, tmp61
>        movb    $0, 8(%rsp)     #, x7
>        movb    $0, 9(%rsp)     #, x6
>        movq    %rbx, %rcx      # tmp58,
>        movq    %rbp, %rsi      # tmp60,
>        movb    $0, 10(%rsp)    #, x5
>        movb    $0, 11(%rsp)    #, x4
>        movb    $0, 12(%rsp)    #, x3
>        movb    $0, 13(%rsp)    #, x2
>        movb    $0, 14(%rsp)    #, x1
>        movb    $0, 15(%rsp)    #, x0
>        call    g       #
>        leaq    12(%rsp), %rdx  #, tmp62
>        movq    %r12, %rcx      # tmp61,
>        movq    %rbx, %rsi      # tmp58,
>        movl    $20, %edi       #,
>        leaq    9(%rsp), %rbx   #, tmp64
>        call    g       #
>        leaq    10(%rsp), %rdx  #, tmp65
>        movq    %rbx, %rcx      # tmp64,
>        movq    %r12, %rsi      # tmp61,
>        movl    $30, %edi       #,
>        call    g       #
>        leaq    8(%rsp), %rdx   #, tmp68
>        movq    %rbp, %rcx      # tmp60,
>        movq    %rbx, %rsi      # tmp64,
>        movl    $40, %edi       #,
>        call    g       #
>        movq    16(%rsp), %rbx  #,
>        movq    24(%rsp), %rbp  #,
>        movq    32(%rsp), %r12  #,
>        addq    $40, %rsp       #,
>        .cfi_def_cfa_offset 8
>        ret
>        .cfi_endproc
> .LFE0:
>        .size   f, .-f
>        .ident  "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision 
> 157303]"
>
> #####################
>
>
> With
>  gcc-trunk -S -O3 -fverbose-asm -march=core2 -mtune=core2 testmanychar.c
> I am getting still
>
> ##################
>
> # options passed:  testmanychar.c -march=core2 -mtune=core2 -O3
>
> .globl f
>        .type   f, @function
> f:
> .LFB0:
>        .cfi_startproc
>        movq    %rbx, -24(%rsp) #,
>        movq    %rbp, -16(%rsp) #,
>        movq    %r12, -8(%rsp)  #,
>        movl    $10, %edi       #,
>        subq    $40, %rsp       #,
>        .cfi_def_cfa_offset 48
>        leaq    13(%rsp), %rbx  #, tmp58
>        .cfi_offset 12, -16
>        .cfi_offset 6, -24
>        .cfi_offset 3, -32
>        leaq    15(%rsp), %rbp  #, tmp60
>        leaq    11(%rsp), %r12  #, tmp61
>        leaq    14(%rsp), %rdx  #, tmp59
>        movq    %rbx, %rcx      # tmp58,
>        movq    %rbp, %rsi      # tmp60,
>        movb    $0, 8(%rsp)     #, x7
>        movb    $0, 9(%rsp)     #, x6
>        movb    $0, 10(%rsp)    #, x5
>        movb    $0, 11(%rsp)    #, x4
>        movb    $0, 12(%rsp)    #, x3
>        movb    $0, 13(%rsp)    #, x2
>        movb    $0, 14(%rsp)    #, x1
>        movb    $0, 15(%rsp)    #, x0
>        call    g       #
>        leaq    12(%rsp), %rdx  #, tmp62
>        movq    %r12, %rcx      # tmp61,
>        movq    %rbx, %rsi      # tmp58,
>        movl    $20, %edi       #,
>        leaq    9(%rsp), %rbx   #, tmp64
>        call    g       #
>        leaq    10(%rsp), %rdx  #, tmp65
>        movq    %rbx, %rcx      # tmp64,
>        movq    %r12, %rsi      # tmp61,
>        movl    $30, %edi       #,
>        call    g       #
>        leaq    8(%rsp), %rdx   #, tmp68
>        movq    %rbp, %rcx      # tmp60,
>        movq    %rbx, %rsi      # tmp64,
>        movl    $40, %edi       #,
>        call    g       #
>        movq    16(%rsp), %rbx  #,
>        movq    24(%rsp), %rbp  #,
>        movq    32(%rsp), %r12  #,
>        addq    $40, %rsp       #,
>        .cfi_def_cfa_offset 8
>        ret
>        .cfi_endproc
> .LFE0:
>        .size   f, .-f
>        .ident  "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision 
> 157303]"
>
> ####
> I was hoping that
>        movb    $0, 8(%rsp)     #, x7
>        movb    $0, 9(%rsp)     #, x6
>        movb    $0, 10(%rsp)    #, x5
>        movb    $0, 11(%rsp)    #, x4
>        movb    $0, 12(%rsp)    #, x3
>        movb    $0, 13(%rsp)    #, x2
>        movb    $0, 14(%rsp)    #, x1
>        movb    $0, 15(%rsp)    #, x0
> could be just something like
>        movq    $0, 8(%rsp)
> or something similar.
>
> I do realize that such an optimization is difficult to implement
> (it probably interferes with the register allocator, etc.). Or is the Core2
> processor smart enough to execute a sequence of 8 consecutive byte moves
> exactly as fast as a single 8-byte word move?
>
>
> Regards.
> --
> Basile STARYNKEVITCH         http://starynkevitch.net/Basile/
> email: basile<at>starynkevitch<dot>net mobile: +33 6 8501 2359
> 8, rue de la Faiencerie, 92340 Bourg La Reine, France
> *** opinions {are only mines, sont seulement les miennes} ***
>


What you're describing is a kind of vectorization. To make vectorization
easier for the compiler, store your data in arrays instead of separate
variables, and process it with loops instead of individual assignments.
In the following example, gcc vectorized the stores only for the 'z'
array (a single pxor/movdqa pair clears all 16 bytes).

sal...@salmin:~/test$ cat mov16.c
extern void g16(char *x0, char *x1, char *x2, char *x3,
                char *x4, char *x5, char *x6, char *x7,
                char *x8, char *x9, char *x10, char *x11,
                char *x12, char *x13, char *x14, char *x15);

extern void g(char *z);

int main() {
        char x0, x1, x2, x3, x4, x5, x6, x7;
        char x8, x9, x10, x11, x12, x13, x14, x15;
        char y[16];
        char z[16];
        int i;

        x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = 0;
        x8 = x9 = x10 = x11 = x12 = x13 = x14 = x15 = 0;
        g16(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7,
            &x8, &x9, &x10, &x11, &x12, &x13, &x14, &x15);

        y[0] = y[1] = y[2] = y[3] = y[4] = y[5] = y[6] = y[7] = 0;
        y[8] = y[9] = y[10] = y[11] = y[12] = y[13] = y[14] = y[15] = 0;
        g(y);

        for (i = 0; i < 16; i++)
                z[i] = 0;
        g(z);

        return 0;
}
sal...@salmin:~/test$ gcc -S -O3 mov16.c
sal...@salmin:~/test$ cat mov16.s
        .file   "mov16.c"
        .text
        .p2align 4,,15
.globl main
        .type   main, @function
main:
.LFB0:
        .cfi_startproc
        subq    $136, %rsp
        .cfi_def_cfa_offset 144
        leaq    112(%rsp), %rax
        leaq    124(%rsp), %rcx
        leaq    125(%rsp), %rdx
        leaq    126(%rsp), %rsi
        leaq    122(%rsp), %r9
        leaq    123(%rsp), %r8
        movq    %rax, 72(%rsp)
        leaq    113(%rsp), %rax
        leaq    127(%rsp), %rdi
        movb    $0, 120(%rsp)
        movb    $0, 121(%rsp)
        movq    %rax, 64(%rsp)
        leaq    114(%rsp), %rax
        movb    $0, 122(%rsp)
        movb    $0, 123(%rsp)
        movb    $0, 124(%rsp)
        movq    %rax, 56(%rsp)
        leaq    115(%rsp), %rax
        movb    $0, 125(%rsp)
        movb    $0, 126(%rsp)
        movb    $0, 127(%rsp)
        movq    %rax, 48(%rsp)
        leaq    116(%rsp), %rax
        movb    $0, 112(%rsp)
        movb    $0, 113(%rsp)
        movb    $0, 114(%rsp)
        movq    %rax, 40(%rsp)
        leaq    117(%rsp), %rax
        movb    $0, 115(%rsp)
        movb    $0, 116(%rsp)
        movb    $0, 117(%rsp)
        movq    %rax, 32(%rsp)
        leaq    118(%rsp), %rax
        movb    $0, 118(%rsp)
        movb    $0, 119(%rsp)
        movq    %rax, 24(%rsp)
        leaq    119(%rsp), %rax
        movq    %rax, 16(%rsp)
        leaq    120(%rsp), %rax
        movq    %rax, 8(%rsp)
        leaq    121(%rsp), %rax
        movq    %rax, (%rsp)
        call    g16
        leaq    96(%rsp), %rdi
        movb    $0, 103(%rsp)
        movb    $0, 102(%rsp)
        movb    $0, 101(%rsp)
        movb    $0, 100(%rsp)
        movb    $0, 99(%rsp)
        movb    $0, 98(%rsp)
        movb    $0, 97(%rsp)
        movb    $0, 96(%rsp)
        movb    $0, 111(%rsp)
        movb    $0, 110(%rsp)
        movb    $0, 109(%rsp)
        movb    $0, 108(%rsp)
        movb    $0, 107(%rsp)
        movb    $0, 106(%rsp)
        movb    $0, 105(%rsp)
        movb    $0, 104(%rsp)
        call    g
        pxor    %xmm0, %xmm0
        leaq    80(%rsp), %rdi
        movdqa  %xmm0, 80(%rsp)
        call    g
        xorl    %eax, %eax
        addq    $136, %rsp
        ret
        .cfi_endproc
.LFE0:
        .size   main, .-main
        .ident  "GCC: (Debian 4.4.2-8) 4.4.2"
        .section        .note.GNU-stack,"",@progbits


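However, it seems that gcc doesn't consider replacing multiple movb
instructions with a single movq, not even for arrays and loops: with
8-byte arrays, both 'y' and 'z' are still cleared one byte at a time.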

sal...@salmin:~/test$ cat mov8.c
extern void g8(char *x0, char *x1, char *x2, char *x3,
                char *x4, char *x5, char *x6, char *x7);

extern void g(char *z);

int main() {
        char x0, x1, x2, x3, x4, x5, x6, x7;
        char y[8];
        char z[8];
        int i;

        x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = 0;
        g8(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7);

        y[0] = y[1] = y[2] = y[3] = y[4] = y[5] = y[6] = y[7] = 0;
        g(y);

        for (i = 0; i < 8; i++)
                z[i] = 0;
        g(z);

        return 0;
}
sal...@salmin:~/test$ gcc -S -O3 mov8.c
sal...@salmin:~/test$ cat mov8.s
        .file   "mov8.c"
        .text
        .p2align 4,,15
.globl main
        .type   main, @function
main:
.LFB0:
        .cfi_startproc
        subq    $56, %rsp
        .cfi_def_cfa_offset 64
        leaq    40(%rsp), %rax
        leaq    44(%rsp), %rcx
        leaq    45(%rsp), %rdx
        leaq    46(%rsp), %rsi
        leaq    42(%rsp), %r9
        leaq    43(%rsp), %r8
        leaq    47(%rsp), %rdi
        movq    %rax, 8(%rsp)
        leaq    41(%rsp), %rax
        movb    $0, 40(%rsp)
        movb    $0, 41(%rsp)
        movq    %rax, (%rsp)
        movb    $0, 42(%rsp)
        movb    $0, 43(%rsp)
        movb    $0, 44(%rsp)
        movb    $0, 45(%rsp)
        movb    $0, 46(%rsp)
        movb    $0, 47(%rsp)
        call    g8
        leaq    32(%rsp), %rdi
        movb    $0, 39(%rsp)
        movb    $0, 38(%rsp)
        movb    $0, 37(%rsp)
        movb    $0, 36(%rsp)
        movb    $0, 35(%rsp)
        movb    $0, 34(%rsp)
        movb    $0, 33(%rsp)
        movb    $0, 32(%rsp)
        call    g
        leaq    16(%rsp), %rdi
        movb    $0, 16(%rsp)
        movb    $0, 17(%rsp)
        movb    $0, 18(%rsp)
        movb    $0, 19(%rsp)
        movb    $0, 20(%rsp)
        movb    $0, 21(%rsp)
        movb    $0, 22(%rsp)
        movb    $0, 23(%rsp)
        call    g
        xorl    %eax, %eax
        addq    $56, %rsp
        ret
        .cfi_endproc
.LFE0:
        .size   main, .-main
        .ident  "GCC: (Debian 4.4.2-8) 4.4.2"
        .section        .note.GNU-stack,"",@progbits




Alexey

Re: clearing many bytes variables (could use one machine instruction)?

Reply via email to