Performance improvements to the amd64 IP checksum code.

Summing to alternate registers almost doubles the performance (probably
from 4 to 6.2 bytes/clock) on Ivy Bridge CPUs. Loop carrying the carry
flag improves Haswell from 7 to 8 bytes/clock. Older CPUs will still
approach 4 bytes/clock.

All of this is achieved with less loop unrolling, improving the
performance for small buffers whose length is not a multiple of the
loop span.
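For illustration only (this is not the patched code; sum32_two_chains and
its alignment assumptions are made up for this sketch), the plain-C version
of the alternate-register idea is simply to add 32-bit words into two
independent 64-bit accumulators, so consecutive additions do not wait on
each other:

#include <stdint.h>
#include <stddef.h>

/*
 * Sketch only: assumes 'buff' is 4-byte aligned and 'len' is a multiple
 * of 8 bytes.  Two accumulators break the serial dependency between
 * consecutive additions, much as the patch alternates adc destinations
 * between two registers.
 */
static uint64_t sum32_two_chains(const unsigned char *buff, size_t len)
{
	const uint32_t *p = (const uint32_t *)buff;
	uint64_t sum_0 = 0, sum_1 = 0;
	size_t i;

	for (i = 0; i < len / 4; i += 2) {
		sum_0 += p[i];		/* even 32-bit words */
		sum_1 += p[i + 1];	/* odd 32-bit words  */
	}
	return sum_0 + sum_1;		/* fold the two chains together */
}

The real speedup comes from the adc-based asm in the patch; the fragment
above only shows why two accumulation chains help.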
Signed-off-by: David Laight <david.lai...@aculab.com>
---
Changes since v1:
- Added cast to result.

I spent far too long looking at this for some code that has to calculate
the UDP checksum before sending packets through a raw IPv4 socket.

Prior to Ivy Bridge (maybe Sandy Bridge), adc always took two clocks, so
the adc chain can only run at 4 bytes/clock (the same as 32-bit adds to a
64-bit register). In Ivy Bridge the 'carry' flag is available a clock
earlier, so 8 bytes/clock is technically possible.

In order to 'loop carry' the carry flag some ingenuity is needed.
Although 'dec reg' doesn't change carry, it has a partial dependency
against the flags register, which makes the loop very slow. The loop
instruction (dec %cx, jump non-zero) is far too slow on Intel CPUs.
But jcxz (jump if %cx zero) is about the same as a normal conditional
jump.

I did get about 12 bytes/clock using adox/adcx, but that would need
run-time patching and some CPUs that support the instructions may run
them very slowly. (A stand-alone user-space reference sketch appears
after the patch below.)

 arch/x86/lib/csum-partial_64.c | 192 +++++++++++++++++++++--------------------
 1 file changed, 98 insertions(+), 94 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d6..7f25ab2 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -10,113 +10,118 @@
 #include <linux/export.h>
 #include <asm/checksum.h>
 
-static inline unsigned short from32to16(unsigned a)
-{
-	unsigned short b = a >> 16;
-	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n"
-	    : "=r" (b)
-	    : "0" (b), "r" (a));
-	return b;
-}
-
 /*
  * Do a 64-bit checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
  *
  * This isn't as time critical as it used to be because many NICs
  * do hardware checksumming these days.
+ *
+ * All Intel cpus prior to Ivy Bridge (maybe Sandy Bridge) have a 2 clock
+ * latency on the result of adc.
+ * This limits any adc loop to 4 bytes/clock - the same as a C loop
+ * that adds 32 bit values to a 64 bit register.
+ * On Ivy Bridge the adc result latency is still 2 clocks, but the carry
+ * latency is reduced to 1 clock.
+ * So summing to alternate registers increases the throughput to approaching
+ * 8 bytes/clock.
+ * Older cpus (eg Core 2) have a 6+ clock delay accessing any of the flags
+ * after a partial update (eg adc after inc).
+ * The strange 'jecxz' loop avoids this.
+ * The Ivy Bridge then needs the loop unrolling once more to approach
+ * 8 bytes per clock.
  *
- * Things tried and found to not make it faster:
- *	Manual Prefetching
- *	Unrolling to an 128 bytes inner loop.
- *	Using interleaving with more registers to break the carry chains.
+ * On 7th gen cpus using adox/adcx can get 12 bytes/clock (maybe 16?)
+ * provided the loop is unrolled further than the one below.
+ * But some other cpus that support adx take 6 clocks for each.
+ *
+ * The 'sum' value on entry is added in, it can exceed 32 bits but
+ * must not get to 56 bits.
  */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+static unsigned do_csum(const unsigned char *buff, long len, u64 sum)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	unsigned int src_align;
+	u64 sum_0 = 0, sum_1;
+	long len_tmp;
+	bool odd = false;
 
-	if (unlikely(len == 0))
-		return result;
-	odd = 1 & (unsigned long) buff;
-	if (unlikely(odd)) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
+	/* 64bit align the base address */
+	src_align = (unsigned long)buff & 7;
+	if (src_align) {
+		if (unlikely(src_align & 1)) {
+			sum <<= 8;
+			/* The extra flag generates better code! */
+			odd = true;
 		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
+		buff -= src_align;
+		len += src_align;
+		if (likely(src_align == 4))
+			sum_0 = *(u32 *)(buff + 4);
+		else
+			/* Mask off unwanted low bytes from full 64bit word */
+			sum_0 = *(u64 *)buff & (~0ull << (src_align * 8));
+		if (unlikely(len < 8)) {
+			/* Mask off the unwanted high bytes */
+			sum += sum_0 & ~(~0ull << (len * 8));
+			goto reduce_32;
+		}
+		len -= 8;
+		buff += 8;
+	}
 
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) {
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
+	/* Read first 8 bytes to 16 byte align the loop below */
+	sum_1 = len & 8 ? *(u64 *)buff : 0;
 
-			/* last up to 7 8byte blocks */
-			count %= 8;
-			while (count) {
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n"
-				    : "=r" (result)
-				    : "m" (*(unsigned long *)buff),
-				    "r" (zero), "0" (result));
-				--count;
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff);
+	/* The main loop uses negative offsets from the end of the buffer */
+	buff += len;
 
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
+	/* Add in trailing bytes to 64bit align the length */
+	if (len & 7) {
+		unsigned int tail_len = len & 7;
+		buff -= tail_len;
+		if (likely(tail_len == 4))
+			sum += *(u32 *)buff;
+		else
+			/* Mask off the unwanted high bytes */
+			sum += *(u64 *)buff & ~(~0ull << (tail_len * 8));
 	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff);
-	if (unlikely(odd)) {
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	}
-	return result;
+
+	/* Align and negate len so that we need to sum [buff[len]..buff[0]) */
+	len = -(len & ~15);
+
+	/*
+	 * Align the byte count to a multiple of 16 then
+	 * add 64 bit words to alternating registers.
+	 * Finally reduce to 64 bits.
+	 */
+	asm(	"	bt    $4, %[len]\n"
+		"	jnc   10f\n"
+		"	add   (%[buff], %[len]), %[sum_0]\n"
+		"	adc   8(%[buff], %[len]), %[sum_1]\n"
+		"	lea   16(%[len]), %[len]\n"
+		"10:	jecxz 20f\n"
+		"	adc   (%[buff], %[len]), %[sum_0]\n"
+		"	adc   8(%[buff], %[len]), %[sum_1]\n"
+		"	lea   32(%[len]), %[len_tmp]\n"
+		"	adc   16(%[buff], %[len]), %[sum_0]\n"
+		"	adc   24(%[buff], %[len]), %[sum_1]\n"
+		"	mov   %[len_tmp], %[len]\n"
+		"	jmp   10b\n"
+		"20:	adc   %[sum_0], %[sum]\n"
+		"	adc   %[sum_1], %[sum]\n"
+		"	adc   $0, %[sum]\n"
+		: [sum] "+&r" (sum), [sum_0] "+&r" (sum_0), [sum_1] "+&r" (sum_1),
+		  [len] "+&c" (len), [len_tmp] "=&r" (len_tmp)
+		: [buff] "r" (buff)
+		: "memory" );
+
+reduce_32:
+	sum = add32_with_carry(sum>>32, sum & 0xffffffff);
+
+	if (unlikely(odd))
+		return __builtin_bswap32(sum);
+
+	return sum;
 }
 
 /*
@@ -133,8 +138,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
  */
 __wsum csum_partial(const void *buff, int len, __wsum sum)
 {
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
+	return (__force __wsum)do_csum(buff, len, (__force u32)sum);
 }
 EXPORT_SYMBOL(csum_partial);
--
1.8.1.2

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
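
For reference, the stand-alone user-space sketch mentioned above. It is
only a sanity-check aid, not part of the patch; ref_csum16() and main()
are made-up names for this example. It sums host-endian 16-bit words into
two alternating 64-bit accumulators and then folds down to 16 bits with
end-around carry:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sketch only: a simple reference for checking checksum results. */
static uint16_t ref_csum16(const unsigned char *buf, size_t len)
{
	uint64_t sum_0 = 0, sum_1 = 0, sum;
	size_t i = 0;

	/* Two accumulators so consecutive additions are independent. */
	for (; i + 4 <= len; i += 4) {
		uint16_t w0, w1;

		memcpy(&w0, buf + i, 2);
		memcpy(&w1, buf + i + 2, 2);
		sum_0 += w0;
		sum_1 += w1;
	}
	if (i + 2 <= len) {
		uint16_t w;

		memcpy(&w, buf + i, 2);
		sum_0 += w;
		i += 2;
	}
	if (i < len)			/* trailing odd byte */
		sum_0 += buf[i];

	/* Fold to 16 bits with end-around carry. */
	sum = sum_0 + sum_1;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	unsigned char pkt[31];
	unsigned int i;

	for (i = 0; i < sizeof(pkt); i++)
		pkt[i] = (unsigned char)(i * 7 + 1);
	printf("checksum: 0x%04x\n", ref_csum16(pkt, sizeof(pkt)));
	return 0;
}

Folding the kernel routine's 32-bit result down to 16 bits in the same way
and comparing it against this sketch on the same buffer is a quick way to
catch tail-handling or folding mistakes.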