On 10 April 2015 at 20:18, John Colvin via D.gnu <d.gnu@puremagic.com> wrote:
> void mul(float[] a, float v)
> {
>     if ((cast(size_t)a.ptr) % 32 == 0
>         && a.length == 16)
>     {
>         foreach (ref el; a)
>             el *= v;
>     }
> }
>
> with
> -Ofast -march=broadwell -frelease
> becomes
>
> void example.mul(float[], float):
>         movq    %rsi, %rax
>         andl    $31, %eax
>         jne     .L44
>         cmpq    $16, %rdi
>         jne     .L44
>         shrq    $2, %rax
>         negq    %rax
>         andl    $7, %eax
>         je      .L10
>         vmulss  (%rsi), %xmm0, %xmm1
>         vmovss  %xmm1, (%rsi)
>         cmpq    $1, %rax
>         je      .L11
>         vmulss  4(%rsi), %xmm0, %xmm1
>         vmovss  %xmm1, 4(%rsi)
>         cmpq    $2, %rax
>         je      .L12
>         vmulss  8(%rsi), %xmm0, %xmm1
>         vmovss  %xmm1, 8(%rsi)
>         cmpq    $3, %rax
>         je      .L13
>         vmulss  12(%rsi), %xmm0, %xmm1
>         vmovss  %xmm1, 12(%rsi)
>         cmpq    $4, %rax
>         je      .L14
>         vmulss  16(%rsi), %xmm0, %xmm1
>         vmovss  %xmm1, 16(%rsi)
>         cmpq    $5, %rax
>         je      .L15
>         vmulss  20(%rsi), %xmm0, %xmm1
>         vmovss  %xmm1, 20(%rsi)
>         cmpq    $6, %rax
>         je      .L16
>         vmulss  24(%rsi), %xmm0, %xmm1
>         movl    $9, %edx
>         movl    $7, %r9d
>         vmovss  %xmm1, 24(%rsi)
> .L5:
>         movl    $16, %edi
>         movl    $8, %r8d
>         movl    $1, %r10d
>         subq    %rax, %rdi
> .L4:
>         leaq    (%rsi,%rax,4), %rcx
>         vbroadcastss    %xmm0, %ymm1
>         vmulps  (%rcx), %ymm1, %ymm2
>         vmovaps %ymm2, (%rcx)
>         cmpq    $1, %r10
>         je      .L6
>         vmulps  32(%rcx), %ymm1, %ymm1
>         vmovaps %ymm1, 32(%rcx)
> .L6:
>         leaq    (%r9,%r8), %rax
>         subq    %r8, %rdx
>         cmpq    %r8, %rdi
>         je      .L43
>         leaq    (%rsi,%rax,4), %rcx
>         vmulss  (%rcx), %xmm0, %xmm1
>         vmovss  %xmm1, (%rcx)
>         leaq    1(%rax), %rcx
>         cmpq    $1, %rdx
>         je      .L43
>         leaq    (%rsi,%rcx,4), %rcx
>         vmulss  (%rcx), %xmm0, %xmm1
>         vmovss  %xmm1, (%rcx)
>         leaq    2(%rax), %rcx
>         cmpq    $2, %rdx
>         je      .L43
>         leaq    (%rsi,%rcx,4), %rcx
>         vmulss  (%rcx), %xmm0, %xmm1
>         vmovss  %xmm1, (%rcx)
>         leaq    3(%rax), %rcx
>         cmpq    $3, %rdx
>         je      .L43
>         leaq    (%rsi,%rcx,4), %rcx
>         vmulss  (%rcx), %xmm0, %xmm1
>         vmovss  %xmm1, (%rcx)
>         leaq    4(%rax), %rcx
>         cmpq    $4, %rdx
>         je      .L43
>         leaq    (%rsi,%rcx,4), %rcx
>         vmulss  (%rcx), %xmm0, %xmm1
>         vmovss  %xmm1, (%rcx)
>         leaq    5(%rax), %rcx
>         cmpq    $5, %rdx
>         je      .L43
>         leaq    (%rsi,%rcx,4), %rcx
>         addq    $6, %rax
>         vmulss  (%rcx), %xmm0, %xmm1
>         vmovss  %xmm1, (%rcx)
>         cmpq    $6, %rdx
>         je      .L43
>         leaq    (%rsi,%rax,4), %rax
>         vmulss  (%rax), %xmm0, %xmm0
>         vmovss  %xmm0, (%rax)
>         vzeroupper
>         ret
> .L43:
>         vzeroupper
> .L44:
>         ret
> .L10:
>         movl    $16, %r8d
>         movl    $2, %r10d
>         movl    $16, %edi
>         movl    $16, %edx
>         xorl    %r9d, %r9d
>         jmp     .L4
> .L11:
>         movl    $15, %edx
>         movl    $1, %r9d
>         jmp     .L5
> .L16:
>         movl    $10, %edx
>         movl    $6, %r9d
>         jmp     .L5
> .L15:
>         movl    $11, %edx
>         movl    $5, %r9d
>         jmp     .L5
> .L14:
>         movl    $12, %edx
>         movl    $4, %r9d
>         jmp     .L5
> .L13:
>         movl    $13, %edx
>         movl    $3, %r9d
>         jmp     .L5
> .L12:
>         movl    $14, %edx
>         movl    $2, %r9d
>         jmp     .L5
>
> Which seems like an awful lot of code, wouldn't you say?
>
> I was expecting something along the lines of this (untested):
>
> void example.mul(float[], float):
>         testb   $31, %sil
>         jne     .L44
>         cmpq    $16, %rdi
>         jne     .L44
>         vbroadcastss    %xmm0, %ymm2
>         vmulps  (%rsi), %ymm2, %ymm0
>         vmulps  32(%rsi), %ymm2, %ymm1
>         vmovaps %ymm0, (%rsi)
>         vmovaps %ymm1, 32(%rsi)
> .L44:
>         ret
>
> Am I being stupid, or is the optimiser making a complete hash of things?
I fear that I cannot reproduce this on gcc-5; maybe it is a problem specific to your gcc version?

_D6nested3mulFAffZv:
        testb   $31, %sil
        jne     .L8
        cmpq    $16, %rdi
        jne     .L8
        vbroadcastss    %xmm0, %ymm0
        vmulps  (%rsi), %ymm0, %ymm1
        vmulps  32(%rsi), %ymm0, %ymm0
        vmovaps %ymm1, (%rsi)
        vmovaps %ymm0, 32(%rsi)
        vzeroupper
.L8:
        ret

Iain.
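
For what it's worth, if an older gdc refuses to vectorise the original, one possible workaround (an untested sketch, not verified against any particular gdc release) is to reinterpret the slice as a fixed-size array once the runtime checks have passed, so the loop's trip count becomes a compile-time constant:

void mul(float[] a, float v)
{
    if ((cast(size_t)a.ptr) % 32 == 0
        && a.length == 16)
    {
        // The checks above guarantee exactly 16 floats, 32-byte
        // aligned, so reinterpreting the slice as a float[16] is
        // safe here and gives the vectoriser a constant loop bound.
        foreach (ref el; *cast(float[16]*)a.ptr)
            el *= v;
    }
}

On gcc-5, as the listing above shows, the straightforward version already compiles down to two vmulps/vmovaps pairs, so this should only matter on compilers where the length deduction fails.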