> 2011-10-13 Bruno Haible <br...@clisp.org> > > ffsl: Optimize on 32-bit platforms. > * lib/ffsl.h (FUNC): If TYPE has the same representation as 'int', just > use ffs() without a loop.
The other frequent case is that sizeof (TYPE) == 2 * sizeof (int). This means, the loop is executed just twice. And in the last loop round, we are making an unnecessary test: If j != 0 and the low 32 bits of j are known to be 0, we know that the high 32 bits of j must be non-zero. We don't need to test that. So I - introduced a variable k that counts the number of loop rounds, so as to avoid the non-zero test in the last loop round, - unrolled the loop, extracting the first and last loop round, letting the compiler realize that the middle part of the loop is empty code, - realized that this code also works when sizeof (TYPE) == sizeof (int), - removed the workaround for the gcc warning, that no longer exists. On MacOS X in 32-bit mode, disabling the GCC built-ins, the code for ffsll.c got simplified from ================================================================= .globl _ffsll _ffsll: pushl %esi movl 12(%esp), %ecx xorl %eax, %eax movl 8(%esp), %edx movl %ecx, %esi orl %edx, %esi je L6 xorl %esi, %esi testl %edx, %edx movl %edx, %eax jne L5 .align 4,0x90 L8: movl %ecx, %edx addl $32, %esi xorl %ecx, %ecx testl %edx, %edx movl %edx, %eax je L8 L5: bsfl %eax, %eax movl $-1, %edx cmove %edx, %eax incl %eax addl %esi, %eax L6: popl %esi ret ================================================================= to ================================================================= .globl _ffsll _ffsll: pushl %esi movl 12(%esp), %edx xorl %ecx, %ecx movl 8(%esp), %eax movl %edx, %esi orl %eax, %esi je L4 testl %eax, %eax movl %eax, %ecx jne L9 movl %edx, %eax movl $-1, %ecx bsfl %eax, %eax cmove %ecx, %eax incl %eax leal 32(%eax), %ecx L4: popl %esi movl %ecx, %eax ret .align 4,0x90 L9: bsfl %ecx, %ecx movl $-1, %eax popl %esi cmove %eax, %ecx incl %ecx movl %ecx, %eax ret ================================================================= Yes, this is simpler: it contains only 2 conditional branches, rather than 3. 2011-10-14 Bruno Haible <br...@clisp.org> ffsl: Optimize on 64-bit platforms. * lib/ffsl.h (FUNC): Omit a test from the last loop round. Do loop unrolling. *** lib/ffsl.h.orig Fri Oct 14 22:13:03 2011 --- lib/ffsl.h Fri Oct 14 22:12:47 2011 *************** *** 34,67 **** #if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) && defined GCC_BUILTIN return GCC_BUILTIN (i); #else ! if (sizeof (TYPE) == sizeof (int)) ! return ffs (i); ! else { ! unsigned TYPE j = i; ! /* Split j into chunks, and look at one chunk after the other. */ ! /* Define chunk_bits so as to avoid a GCC warning ! "right shift count >= width of type" ! if sizeof (TYPE) == sizeof (int). */ ! enum ! { ! chunk_bits = (sizeof (TYPE) != sizeof (int) ! ? CHAR_BIT * sizeof (unsigned int) ! : 0) ! }; ! int result = 0; /* It is tempting to write if (!j) here, but if we do this, Solaris 10/x86 "cc -O" miscompiles the code. */ if (!i) return 0; ! while (1) ! { ! if ((unsigned int) j) ! return result + ffs ((unsigned int) j); ! j >>= chunk_bits; ! result += chunk_bits; ! } } #endif } --- 34,64 ---- #if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) && defined GCC_BUILTIN return GCC_BUILTIN (i); #else ! unsigned TYPE j = i; ! /* Split j into chunks, and look at one chunk after the other. */ ! enum { chunk_bits = CHAR_BIT * sizeof (unsigned int) }; ! /* The number of chunks is ceil (sizeof (TYPE) / sizeof (unsigned int)) ! = (sizeof (TYPE) - 1) / sizeof (unsigned int) + 1. */ ! enum { chunk_count = (sizeof (TYPE) - 1) / sizeof (unsigned int) + 1 }; ! ! if (chunk_count > 1) { ! size_t k; /* It is tempting to write if (!j) here, but if we do this, Solaris 10/x86 "cc -O" miscompiles the code. */ if (!i) return 0; ! /* Unroll the first loop round. k = 0. */ ! if ((unsigned int) j) ! return ffs ((unsigned int) j); ! /* Generic loop. */ ! for (k = 1; k < chunk_count - 1; k++) ! if ((unsigned int) (j >> (k * chunk_bits)) != 0) ! return k * chunk_bits + ffs ((unsigned int) (j >> (k * chunk_bits))); } + /* Last loop round. k = chunk_count - 1. */ + return (chunk_count - 1) * chunk_bits + + ffs ((unsigned int) (j >> ((chunk_count - 1) * chunk_bits))); #endif } -- In memoriam Robert Dziekański <http://en.wikipedia.org/wiki/Robert_Dziekański>