------- Comment #6 from michaelni at gmx dot at 2008-03-22 02:15 ------- As Uros has "challenged me to beat performance of gcc-4.4 generated code by hand-crafted assembly using the example of PR 21395", here's my entry. Sadly, I only have gcc-4.3 compiled ATM for comparison, but 4.3 generates better code than 4.4, so I guess that's OK. Its inner loop is: .L23: movq (%ecx,%eax,2), %mm0 psubw (%edx,%eax,2), %mm0 addl $4, %eax cmpl %eax, %ebx movq %mm0, %mm1 psraw $15, %mm0 pxor %mm0, %mm1 psubw %mm0, %mm1 movq %mm1, %mm0 punpcklwd %mm1, %mm1 punpckhwd %mm3, %mm0 psrad $16, %mm1 paddd %mm0, %mm1 paddd %mm1, %mm2 movq %mm2, -24(%ebp) jg .L23
It's better because the psraw doesn't depend on the previous movq result. Now here's my code (this is naively written and not unrolled or hand-scheduled; it also uses hardcoded registers, so I suspect it can be improved further ...) int SimpleBlockDiff::Diff () { #ifdef __MMX__ int sum; int x1b=-2*xl; int ylb= yl; asm volatile( "xorl %%edx, %%edx \n\t" "pcmpeqw %%mm6, %%mm6 \n\t" "pxor %%mm7, %%mm7 \n\t" "psrlw $15, %%mm6 \n\t" "1: \n\t" "movl (%1, %%edx, 4), %%eax \n\t" "movl (%2, %%edx, 4), %%esi \n\t" "movl %3, %%ecx \n\t" "subl %%ecx, %%eax \n\t" "subl %%ecx, %%esi \n\t" "2: \n\t" "pxor %%mm1, %%mm1 \n\t" "movq (%%eax, %%ecx), %%mm0\n\t" "psubw (%%esi, %%ecx), %%mm0\n\t" #if 0 "psubw %%mm0, %%mm1 \n\t" "pmaxsw %%mm1, %%mm0 \n\t" #else "pcmpgtw %%mm0, %%mm1 \n\t" "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" #endif "pmaddwd %%mm6, %%mm0 \n\t" "paddd %%mm0, %%mm7 \n\t" "addl $8, %%ecx \n\t" " jnz 2b \n\t" "incl %%edx \n\t" "cmpl %%edx, %4 \n\t" " jnz 1b \n\t" "movq %%mm7, %%mm0 \n\t" "psrlq $32, %%mm7 \n\t" "paddd %%mm7, %%mm0 \n\t" "movd %%mm0, %0 \n\t" :"=g" (sum) :"r" (pic_data), "r" (ref_data), "m"(x1b), "m"(ylb) : "%eax", "%esi", "%ecx", "%edx" ); return sum; -------------- And benchmarks: on a Duron: gcc-4.3: real 0m2.034s user 0m1.882s sys 0m0.017s asm: real 0m1.312s user 0m1.208s sys 0m0.016s on a 500MHz Pentium 3: gcc-4.3: real 0m4.021s user 0m3.767s sys 0m0.009s asm: real 0m2.827s user 0m2.565s sys 0m0.055s -- michaelni at gmx dot at changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |michaelni at gmx dot at http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21395