__uint128_t sqr_1(__uint64_t x) { return (x * (__uint128_t)x); }

For this function, gcc-4.1.1-20060308 produces ugly code:
sqr_1:
        xorl    %edx, %edx      # D.1810
        movq    %rdi, %rax      # x, D.1810
        movq    %rdx, %rcx      #, tmp62
        imulq   %rdi, %rcx      # D.1810, tmp62
        mulq    %rdi            # D.1810
        addq    %rcx, %rcx      # tmp62
        addq    %rdx, %rcx      #, tmp62
        movq    %rcx, %rdx      # tmp62,
        ret

The optimal solution is:

        movq    %rdi, %rax
        mulq    %rax            ; or mulq %rdi
        ret

__uint128_t sqr_2(__uint64_t x)
{
        union {
                __uint128_t v;
                struct { __uint64_t lo; __uint64_t hi; } q;
        } r;
        asm volatile("mulq %%rax" : "=d" (r.q.hi), "=a" (r.q.lo) : "a" (x));
        return r.v;
}

sqr_2 gives better code, but it is still suboptimal :/

sqr_2:
        movq    %rdi, %rax      # x, x
#APP
        mulq %rax
#NO_APP
        movq    %rdx, -16(%rsp) # tmp60, r.q.hi
        movq    %rax, -24(%rsp) # tmp61, r.q.lo
        movq    -16(%rsp), %rdx # r.v, r.v
        movq    -24(%rsp), %rax # r.v, r.v
        ret

--
Summary: missed optimization / 128-bit arithmetic.
Product: gcc
Version: 4.1.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: other
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: pluto at agmk dot net
GCC build triplet: x86-64-linux
GCC host triplet: x86-64-linux
GCC target triplet: x86-64-linux

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26674