On Fri, Jan 16, 2009 at 08:10:15PM +0100, Martin Michlmayr wrote:
* brian m. carlson <sand...@crustytoothpaste.ath.cx> [2009-01-16 18:38]: Obviously, since the two functions do the exact same thing, they should be optimized to be identical. Instead, mul is pessimized. Can you check if this happens with gcc-4.3 and trunk from SVN, and if so, file the bug upstream?
It does happen with gcc-4.3 (hence, I filed the bug there) as well as gcc-snapshot 20090107-1. I really would prefer not to build from SVN if I don't have to. I believe that it's the maintainer's job to file the bug upstream, and upstream has not won any points with me for neglecting a bug that I reported with a trivial patch. Thus, I am hesitant to forward the bug myself. I have been informed that Apple's GCC does better on this[0]; I will see if I can get my friend to provide a .i and .s file from that version. I also just noted that gcc-4.1 and gcc-4.2 produce much less bad code. I've attached intrinsics.s from both of those versions; they each use 8 movss and 4 mulss. Nevertheless, they still do not convert the code into three SSE instructions. [0] I believe Apple's GCC is based on an older version of FSF gcc, which is probably why it does better; the results are likely identical between the two. -- brian m. carlson / brian with sandals: Houston, Texas, US +1 713 440 7475 | http://crustytoothpaste.ath.cx/~bmc | My opinion only troff on top of XML: http://crustytoothpaste.ath.cx/~bmc/code/thwack OpenPGP: RSA v4 4096b 88AC E9B2 9196 305B A994 7552 F1BA 225C 0223 B187
# intrinsics.s -- compiler output for intrinsics.c (see the .ident string at
# the end of this listing for the exact GCC version).
# Syntax: GNU as, AT&T operand order (source, destination). x86-64.
# The listing is unmodified compiler output; only line breaks and comments
# have been added for readability.
    .file "intrinsics.c"
    .text
    .p2align 4,,15
    .globl mul
    .type mul, @function
# mul: element-by-element single-precision multiply of four floats.
#   In:  %rdi, %rsi = addresses of the two inputs; %rdx = address of output.
#   Uses four scalar load / multiply / store triples at byte offsets
#   0, 4, 8 and 12 (8 movss + 4 mulss in total). Clobbers %xmm0.
mul:
.LFB491:
    # element 0
    movss (%rdi), %xmm0
    mulss (%rsi), %xmm0
    movss %xmm0, (%rdx)
    # element 1
    movss 4(%rdi), %xmm0
    mulss 4(%rsi), %xmm0
    movss %xmm0, 4(%rdx)
    # element 2
    movss 8(%rdi), %xmm0
    mulss 8(%rsi), %xmm0
    movss %xmm0, 8(%rdx)
    # element 3
    movss 12(%rdi), %xmm0
    mulss 12(%rsi), %xmm0
    movss %xmm0, 12(%rdx)
    ret
.LFE491:
    .size mul, .-mul
    .p2align 4,,15
    .globl mul2
    .type mul2, @function
# mul2: same register interface as mul, but done with one packed
#   load / multiply / store (three SSE instructions) covering all 16 bytes.
#   movaps and a mulps memory operand fault on addresses that are not
#   16-byte aligned, so all three pointers must be 16-byte aligned here.
#   Clobbers %xmm0.
mul2:
.LFB492:
    movaps (%rdi), %xmm0
    mulps (%rsi), %xmm0
    movaps %xmm0, (%rdx)
    ret
.LFE492:
    .size mul2, .-mul2
# Format string passed to printf by main.
    .section .rodata.str1.1,"aMS",@progbits,1
.LC8:
    .string "%f %f %f %f\n"
# Four 8-byte constants, each emitted as two 32-bit halves (low word first).
# Read as IEEE-754 doubles they decode to approximately the values noted.
    .section .rodata.cst8,"aM",@progbits,8
    .align 8
.LC13:
    # ~ -11.2
    .long 1610612736
    .long -1071225242
    .align 8
.LC14:
    # ~ 5.61
    .long 3758096384
    .long 1075212451
    .align 8
.LC15:
    # ~ 9.1
    .long 536870912
    .long 1075983155
    .align 8
.LC16:
    # ~ -0.84
    .long 3221225472
    .long -1075125945
    .text
    .p2align 4,,15
    .globl main
    .type main, @function
# main: prints four precomputed values with printf and returns 0.
#   There is no call to mul or mul2 in this function; the values handed to
#   printf come straight from the constant pool above.
main:
.LFB493:
    # Reserve 56 bytes of stack for locals.
    subq $56, %rsp
.LCFI0:
    # First printf argument: address of the format string.
    movl $.LC8, %edi
    # Variadic call convention (SysV AMD64): %al = number of vector
    # registers carrying arguments, here %xmm0-%xmm3.
    movl $4, %eax
    # The four double arguments are loaded into %xmm3..%xmm0, interleaved
    # with 32-bit immediate stores to 32..44(%rsp).
    # NOTE(review): those immediates decode as floats ~1.2, 3.5, 1.7, 2.8 --
    # presumably the first source array; confirm against intrinsics.c.
    movsd .LC13(%rip), %xmm3
    movl $0x3f99999a, 32(%rsp)
    movsd .LC14(%rip), %xmm2
    movl $0x40600000, 36(%rsp)
    movsd .LC15(%rip), %xmm1
    movl $0x3fd9999a, 40(%rsp)
    movsd .LC16(%rip), %xmm0
    movl $0x40333333, 44(%rsp)
    # NOTE(review): decode as floats ~-0.7, 2.6, 3.3, -4.0 -- presumably the
    # second source array; confirm against intrinsics.c.
    movl $0xbf333333, 16(%rsp)
    movl $0x40266666, 20(%rsp)
    movl $0x40533333, 24(%rsp)
    movl $0xc0800000, 28(%rsp)
    # NOTE(review): decode as floats ~-0.84, 9.1, 5.61, -11.2, i.e. the
    # element-wise products of the two groups above -- presumably the result
    # array, already folded at compile time. No instruction in this listing
    # reads any of these twelve stack slots back.
    movl $0xbf570a3e, (%rsp)
    movl $0x41119999, 4(%rsp)
    movl $0x40b3851f, 8(%rsp)
    movl $0xc1333333, 12(%rsp)
    call printf
    # Return value 0.
    xorl %eax, %eax
    addq $56, %rsp
    ret
.LFE493:
    .size main, .-main
# Call-frame (unwind) information: one common entry (.Lframe1) followed by
# one descriptor per function, each referencing that function's
# .LFBnnn/.LFEnnn begin/end labels.
    .section .eh_frame,"a",@progbits
.Lframe1:
    .long .LECIE1-.LSCIE1
.LSCIE1:
    .long 0x0
    .byte 0x1
    .string "zR"
    .uleb128 0x1
    .sleb128 -8
    .byte 0x10
    .uleb128 0x1
    .byte 0x3
    .byte 0xc
    .uleb128 0x7
    .uleb128 0x8
    .byte 0x90
    .uleb128 0x1
    .align 8
.LECIE1:
# Descriptor for mul (.LFB491 .. .LFE491).
.LSFDE1:
    .long .LEFDE1-.LASFDE1
.LASFDE1:
    .long .LASFDE1-.Lframe1
    .long .LFB491
    .long .LFE491-.LFB491
    .uleb128 0x0
    .align 8
.LEFDE1:
# Descriptor for mul2 (.LFB492 .. .LFE492).
.LSFDE3:
    .long .LEFDE3-.LASFDE3
.LASFDE3:
    .long .LASFDE3-.Lframe1
    .long .LFB492
    .long .LFE492-.LFB492
    .uleb128 0x0
    .align 8
.LEFDE3:
# Descriptor for main (.LFB493 .. .LFE493). It carries extra bytes keyed to
# .LCFI0, the point just after the stack adjustment.
# NOTE(review): 0xe / 0x40 presumably encode "CFA offset = 64" (56 bytes of
# locals + 8-byte return address) -- confirm against the DWARF CFI tables.
.LSFDE5:
    .long .LEFDE5-.LASFDE5
.LASFDE5:
    .long .LASFDE5-.Lframe1
    .long .LFB493
    .long .LFE493-.LFB493
    .uleb128 0x0
    .byte 0x4
    .long .LCFI0-.LFB493
    .byte 0xe
    .uleb128 0x40
    .align 8
.LEFDE5:
    .ident "GCC: (GNU) 4.1.3 20080704 (prerelease) (Debian 4.1.2-24)"
    .section .note.GNU-stack,"",@progbits
# intrinsics.s -- compiler output for intrinsics.c (see the .ident string at
# the end of this listing for the exact GCC version).
# Syntax: GNU as, AT&T operand order (source, destination). x86-64.
# The listing is unmodified compiler output; only line breaks and comments
# have been added for readability.
    .file "intrinsics.c"
    .text
    .p2align 4,,15
    .globl mul
    .type mul, @function
# mul: element-by-element single-precision multiply of four floats.
#   In:  %rdi, %rsi = addresses of the two inputs; %rdx = address of output.
#   Uses four scalar load / multiply / store triples at byte offsets
#   0, 4, 8 and 12 (8 movss + 4 mulss in total). Clobbers %xmm0.
mul:
.LFB513:
    # element 0
    movss (%rdi), %xmm0
    mulss (%rsi), %xmm0
    movss %xmm0, (%rdx)
    # element 1
    movss 4(%rdi), %xmm0
    mulss 4(%rsi), %xmm0
    movss %xmm0, 4(%rdx)
    # element 2
    movss 8(%rdi), %xmm0
    mulss 8(%rsi), %xmm0
    movss %xmm0, 8(%rdx)
    # element 3
    movss 12(%rdi), %xmm0
    mulss 12(%rsi), %xmm0
    movss %xmm0, 12(%rdx)
    ret
.LFE513:
    .size mul, .-mul
    .p2align 4,,15
    .globl mul2
    .type mul2, @function
# mul2: same register interface as mul, but done with one packed
#   load / multiply / store (three SSE instructions) covering all 16 bytes.
#   movaps and a mulps memory operand fault on addresses that are not
#   16-byte aligned, so all three pointers must be 16-byte aligned here.
#   Clobbers %xmm0.
mul2:
.LFB514:
    movaps (%rdi), %xmm0
    mulps (%rsi), %xmm0
    movaps %xmm0, (%rdx)
    ret
.LFE514:
    .size mul2, .-mul2
# Format string passed to printf by main.
    .section .rodata.str1.1,"aMS",@progbits,1
.LC8:
    .string "%f %f %f %f\n"
    .text
    .p2align 4,,15
    .globl main
    .type main, @function
# main: prints four precomputed values with printf and returns 0.
#   There is no call to mul or mul2 in this function; the values handed to
#   printf come straight from the constant pool emitted after this function.
main:
.LFB515:
    # Reserve 56 bytes of stack for locals.
    subq $56, %rsp
.LCFI0:
    # First printf argument: address of the format string.
    movl $.LC8, %edi
    # Variadic call convention (SysV AMD64): %al = number of vector
    # registers carrying arguments, here %xmm0-%xmm3.
    movl $4, %eax
    # The four double arguments are loaded into %xmm3..%xmm0, interleaved
    # with 32-bit immediate stores to 32..44(%rsp).
    # NOTE(review): those immediates decode as floats ~1.2, 3.5, 1.7, 2.8 --
    # presumably the first source array; confirm against intrinsics.c.
    movsd .LC14(%rip), %xmm3
    movl $0x3f99999a, 32(%rsp)
    movsd .LC15(%rip), %xmm2
    movl $0x40600000, 36(%rsp)
    movsd .LC16(%rip), %xmm1
    movl $0x3fd9999a, 40(%rsp)
    movsd .LC10(%rip), %xmm0
    movl $0x40333333, 44(%rsp)
    # NOTE(review): decode as floats ~-0.7, 2.6, 3.3, -4.0 -- presumably the
    # second source array; confirm against intrinsics.c.
    movl $0xbf333333, 16(%rsp)
    movl $0x40266666, 20(%rsp)
    movl $0x40533333, 24(%rsp)
    movl $0xc0800000, 28(%rsp)
    # NOTE(review): decode as floats ~-0.84, 9.1, 5.61, -11.2, i.e. the
    # element-wise products of the two groups above -- presumably the result
    # array, already folded at compile time. No instruction in this listing
    # reads any of these twelve stack slots back.
    movl $0xbf570a3e, (%rsp)
    movl $0x41119999, 4(%rsp)
    movl $0x40b3851f, 8(%rsp)
    movl $0xc1333333, 12(%rsp)
    call printf
    # Return value 0.
    xorl %eax, %eax
    addq $56, %rsp
    ret
.LFE515:
    .size main, .-main
# Four 8-byte constants, each emitted as two 32-bit halves (low word first).
# Read as IEEE-754 doubles they decode to approximately the values noted.
    .section .rodata.cst8,"aM",@progbits,8
    .align 8
.LC10:
    # ~ -0.84
    .long 3221225472
    .long -1075125945
    .align 8
.LC14:
    # ~ -11.2
    .long 1610612736
    .long -1071225242
    .align 8
.LC15:
    # ~ 5.61
    .long 3758096384
    .long 1075212451
    .align 8
.LC16:
    # ~ 9.1
    .long 536870912
    .long 1075983155
# Call-frame (unwind) information: one common entry (.Lframe1) followed by
# one descriptor per function, each referencing that function's
# .LFBnnn/.LFEnnn begin/end labels.
    .section .eh_frame,"a",@progbits
.Lframe1:
    .long .LECIE1-.LSCIE1
.LSCIE1:
    .long 0x0
    .byte 0x1
    .string "zR"
    .uleb128 0x1
    .sleb128 -8
    .byte 0x10
    .uleb128 0x1
    .byte 0x3
    .byte 0xc
    .uleb128 0x7
    .uleb128 0x8
    .byte 0x90
    .uleb128 0x1
    .align 8
.LECIE1:
# Descriptor for mul (.LFB513 .. .LFE513).
.LSFDE1:
    .long .LEFDE1-.LASFDE1
.LASFDE1:
    .long .LASFDE1-.Lframe1
    .long .LFB513
    .long .LFE513-.LFB513
    .uleb128 0x0
    .align 8
.LEFDE1:
# Descriptor for mul2 (.LFB514 .. .LFE514).
.LSFDE3:
    .long .LEFDE3-.LASFDE3
.LASFDE3:
    .long .LASFDE3-.Lframe1
    .long .LFB514
    .long .LFE514-.LFB514
    .uleb128 0x0
    .align 8
.LEFDE3:
# Descriptor for main (.LFB515 .. .LFE515). It carries extra bytes keyed to
# .LCFI0, the point just after the stack adjustment.
# NOTE(review): 0xe / 0x40 presumably encode "CFA offset = 64" (56 bytes of
# locals + 8-byte return address) -- confirm against the DWARF CFI tables.
.LSFDE5:
    .long .LEFDE5-.LASFDE5
.LASFDE5:
    .long .LASFDE5-.Lframe1
    .long .LFB515
    .long .LFE515-.LFB515
    .uleb128 0x0
    .byte 0x4
    .long .LCFI0-.LFB515
    .byte 0xe
    .uleb128 0x40
    .align 8
.LEFDE5:
    .ident "GCC: (GNU) 4.2.4 (Debian 4.2.4-5)"
    .section .note.GNU-stack,"",@progbits
signature.asc
Description: Digital signature