$ testcase: void transpose4x4 ( unsigned char* dst, unsigned char* src, int dst_stride, int src_stride ) { asm volatile( "movd %4, %%mm0 \n\t" "movd %5, %%mm1 \n\t" "movd %6, %%mm2 \n\t" "movd %7, %%mm3 \n\t" "punpcklbw %%mm1, %%mm0 \n\t" "punpcklbw %%mm3, %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" "punpcklwd %%mm2, %%mm0 \n\t" "punpckhwd %%mm2, %%mm1 \n\t" "movd %%mm0, %0 \n\t" "punpckhdq %%mm0, %%mm0 \n\t" "movd %%mm0, %1 \n\t" "movd %%mm1, %2 \n\t" "punpckhdq %%mm1, %%mm1 \n\t" "movd %%mm1, %3 \n\t" : "=m" (*(unsigned*)(dst + 0*dst_stride)), "=m" (*(unsigned*)(dst + 1*dst_stride)), "=m" (*(unsigned*)(dst + 2*dst_stride)), "=m" (*(unsigned*)(dst + 3*dst_stride)) : "m" (*(unsigned*)(src + 0*src_stride)), "m" (*(unsigned*)(src + 1*src_stride)), "m" (*(unsigned*)(src + 2*src_stride)), "m" (*(unsigned*)(src + 3*src_stride)) ); }
gcc-4.1 / -O2 produces: transpose4x4: leal (%rdx,%rdx), %r9d leal (%rcx,%rcx), %eax movslq %edx,%r11 movslq %ecx,%r8 movslq %r9d,%r10 addl %edx, %r9d movslq %eax,%rdx addl %ecx, %eax movslq %r9d,%r9 cltq #APP movd (%rsi), %mm0 movd (%rsi,%r8), %mm1 movd (%rsi,%rdx), %mm2 movd (%rsi,%rax), %mm3 punpcklbw %mm1, %mm0 punpcklbw %mm3, %mm2 movq %mm0, %mm1 punpcklwd %mm2, %mm0 punpckhwd %mm2, %mm1 movd %mm0, (%rdi) punpckhdq %mm0, %mm0 movd %mm0, (%rdi,%r11) movd %mm1, (%rdi,%r10) punpckhdq %mm1, %mm1 movd %mm1, (%rdi,%r9) #NO_APP ret [ 4.0.2 / -O2 ] produces smaller/faster code: transpose4x4: leal (%rdx,%rdx), %r8d movslq %edx,%r10 leaq (%rcx,%rcx,2), %rax movslq %r8d,%r9 addl %edx, %r8d movslq %r8d,%r8 #APP movd (%rsi), %mm0 movd (%rsi,%rcx), %mm1 movd (%rsi,%rcx,2), %mm2 movd (%rax,%rsi), %mm3 punpcklbw %mm1, %mm0 punpcklbw %mm3, %mm2 movq %mm0, %mm1 punpcklwd %mm2, %mm0 punpckhwd %mm2, %mm1 movd %mm0, (%rdi) punpckhdq %mm0, %mm0 movd %mm0, (%rdi,%r10) movd %mm1, (%rdi,%r9) punpckhdq %mm1, %mm1 movd %mm1, (%rdi,%r8) #NO_APP ret -- Summary: code quality regression. Product: gcc Version: 4.1.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: other AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: pluto at agmk dot net GCC build triplet: x86-64 GCC host triplet: x86-64 GCC target triplet: x86-64 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26056