Performance improvements to the amd64 IP checksum code.

Summing to alternate registers almost doubles the performance (probably
from 4 to 6.2 bytes/clock) on Ivy Bridge CPUs. Loop carrying the carry
flag improves Haswell from 7 to 8 bytes/clock. Older CPUs will still
approach 4 bytes/clock.

All of this is achieved with less loop unrolling, improving the
performance for small buffers whose length is not a multiple of the
loop span.
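For illustration only (this is not the patched code; sum32_two_chains and
its alignment assumptions are made up for this sketch), the plain-C version
of the alternate-register idea is simply to add 32-bit words into two
independent 64-bit accumulators, so consecutive additions do not wait on
each other:

#include <stdint.h>
#include <stddef.h>

/*
 * Sketch only: assumes 'buff' is 4-byte aligned and 'len' is a multiple
 * of 8 bytes.  Two accumulators break the serial dependency between
 * consecutive additions, much as the patch alternates adc destinations
 * between two registers.
 */
static uint64_t sum32_two_chains(const unsigned char *buff, size_t len)
{
	const uint32_t *p = (const uint32_t *)buff;
	uint64_t sum_0 = 0, sum_1 = 0;
	size_t i;

	for (i = 0; i < len / 4; i += 2) {
		sum_0 += p[i];		/* even 32-bit words */
		sum_1 += p[i + 1];	/* odd 32-bit words  */
	}
	return sum_0 + sum_1;		/* fold the two chains together */
}

The real speedup comes from the adc-based asm in the patch; the fragment
above only shows why two accumulation chains help.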
Signed-off-by: David Laight <david.lai...@aculab.com>
---
Changes since v1:
- Added cast to result.

I spent far too long looking at this for some code that has to calculate
the UDP checksum before sending packets through a raw IPv4 socket.

Prior to Ivy Bridge (maybe Sandy Bridge), adc always took two clocks, so
the adc chain can only run at 4 bytes/clock (the same as 32-bit adds to a
64-bit register). In Ivy Bridge the 'carry' flag is available a clock
earlier, so 8 bytes/clock is technically possible.

In order to 'loop carry' the carry flag some ingenuity is needed.
Although 'dec reg' doesn't change carry, it has a partial dependency
against the flags register, which makes the loop very slow. The loop
instruction (dec %cx, jump non-zero) is far too slow on Intel CPUs.
But jcxz (jump if %cx zero) is about the same as a normal conditional
jump.

I did get about 12 bytes/clock using adox/adcx, but that would need
run-time patching and some CPUs that support the instructions may run
them very slowly. (A stand-alone user-space reference sketch appears
after the patch below.)

 arch/x86/lib/csum-partial_64.c | 192 +++++++++++++++++++++--------------------
 1 file changed, 98 insertions(+), 94 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d6..7f25ab2 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -10,113 +10,118 @@
 #include <linux/export.h>
 #include <asm/checksum.h>
 
-static inline unsigned short from32to16(unsigned a)
-{
-	unsigned short b = a >> 16;
-	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n"
-	    : "=r" (b)
-	    : "0" (b), "r" (a));
-	return b;
-}
-
 /*
  * Do a 64-bit checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
  *
  * This isn't as time critical as it used to be because many NICs
  * do hardware checksumming these days.
+ *
+ * All Intel cpus prior to Ivy Bridge (maybe Sandy Bridge) have a 2 clock
+ * latency on the result of adc.
+ * This limits any adc loop to 4 bytes/clock - the same as a C loop
+ * that adds 32 bit values to a 64 bit register.
+ * On Ivy Bridge the adc result latency is still 2 clocks, but the carry
+ * latency is reduced to 1 clock.
+ * So summing to alternate registers increases the throughput to approaching
+ * 8 bytes/clock.
+ * Older cpus (eg Core 2) have a 6+ clock delay accessing any of the flags
+ * after a partial update (eg adc after inc).
+ * The strange 'jecxz' loop avoids this.
+ * The Ivy Bridge then needs the loop unrolling once more to approach
+ * 8 bytes per clock.
  *
- * Things tried and found to not make it faster:
- *	Manual Prefetching
- *	Unrolling to an 128 bytes inner loop.
- *	Using interleaving with more registers to break the carry chains.
+ * On 7th gen cpus using adox/adcx can get 12 bytes/clock (maybe 16?)
+ * provided the loop is unrolled further than the one below.
+ * But some other cpus that support adx take 6 clocks for each.
+ *
+ * The 'sum' value on entry is added in, it can exceed 32 bits but
+ * must not get to 56 bits.
  */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+static unsigned do_csum(const unsigned char *buff, long len, u64 sum)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	unsigned int src_align;
+	u64 sum_0 = 0, sum_1;
+	long len_tmp;
+	bool odd = false;
 
-	if (unlikely(len == 0))
-		return result;
-	odd = 1 & (unsigned long) buff;
-	if (unlikely(odd)) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
+	/* 64bit align the base address */
+	src_align = (unsigned long)buff & 7;
+	if (src_align) {
+		if (unlikely(src_align & 1)) {
+			sum <<= 8;
+			/* The extra flag generates better code! */
+			odd = true;
 		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
+		buff -= src_align;
+		len += src_align;
+		if (likely(src_align == 4))
+			sum_0 = *(u32 *)(buff + 4);
+		else
+			/* Mask off unwanted low bytes from full 64bit word */
+			sum_0 = *(u64 *)buff & (~0ull << (src_align * 8));
+		if (unlikely(len < 8)) {
+			/* Mask off the unwanted high bytes */
+			sum += sum_0 & ~(~0ull << (len * 8));
+			goto reduce_32;
+		}
+		len -= 8;
+		buff += 8;
+	}
 
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) {
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
+	/* Read first 8 bytes to 16 byte align the loop below */
+	sum_1 = len & 8 ? *(u64 *)buff : 0;
 
-			/* last up to 7 8byte blocks */
-			count %= 8;
-			while (count) {
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n"
-				    : "=r" (result)
-				    : "m" (*(unsigned long *)buff),
-				    "r" (zero), "0" (result));
-				--count;
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff);
+	/* The main loop uses negative offsets from the end of the buffer */
+	buff += len;
 
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
+	/* Add in trailing bytes to 64bit align the length */
+	if (len & 7) {
+		unsigned int tail_len = len & 7;
+		buff -= tail_len;
+		if (likely(tail_len == 4))
+			sum += *(u32 *)buff;
+		else
+			/* Mask off the unwanted high bytes */
+			sum += *(u64 *)buff & ~(~0ull << (tail_len * 8));
 	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff);
-	if (unlikely(odd)) {
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	}
-	return result;
+
+	/* Align and negate len so that we need to sum [buff[len]..buff[0]) */
+	len = -(len & ~15);
+
+	/*
+	 * Align the byte count to a multiple of 16 then
+	 * add 64 bit words to alternating registers.
+	 * Finally reduce to 64 bits.
+	 */
+	asm(	"	bt    $4, %[len]\n"
+		"	jnc   10f\n"
+		"	add   (%[buff], %[len]), %[sum_0]\n"
+		"	adc   8(%[buff], %[len]), %[sum_1]\n"
+		"	lea   16(%[len]), %[len]\n"
+		"10:	jecxz 20f\n"
+		"	adc   (%[buff], %[len]), %[sum_0]\n"
+		"	adc   8(%[buff], %[len]), %[sum_1]\n"
+		"	lea   32(%[len]), %[len_tmp]\n"
+		"	adc   16(%[buff], %[len]), %[sum_0]\n"
+		"	adc   24(%[buff], %[len]), %[sum_1]\n"
+		"	mov   %[len_tmp], %[len]\n"
+		"	jmp   10b\n"
+		"20:	adc   %[sum_0], %[sum]\n"
+		"	adc   %[sum_1], %[sum]\n"
+		"	adc   $0, %[sum]\n"
+		: [sum] "+&r" (sum), [sum_0] "+&r" (sum_0), [sum_1] "+&r" (sum_1),
+		  [len] "+&c" (len), [len_tmp] "=&r" (len_tmp)
+		: [buff] "r" (buff)
+		: "memory" );
+
+reduce_32:
+	sum = add32_with_carry(sum>>32, sum & 0xffffffff);
+
+	if (unlikely(odd))
+		return __builtin_bswap32(sum);
+
+	return sum;
 }
 
 /*
@@ -133,8 +138,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
  */
 __wsum csum_partial(const void *buff, int len, __wsum sum)
 {
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
+	return (__force __wsum)do_csum(buff, len, (__force u32)sum);
 }
 EXPORT_SYMBOL(csum_partial);
--
1.8.1.2

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
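
For reference, the stand-alone user-space sketch mentioned above. It is
only a sanity-check aid, not part of the patch; ref_csum16() and main()
are made-up names for this example. It sums host-endian 16-bit words into
two alternating 64-bit accumulators and then folds down to 16 bits with
end-around carry:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sketch only: a simple reference for checking checksum results. */
static uint16_t ref_csum16(const unsigned char *buf, size_t len)
{
	uint64_t sum_0 = 0, sum_1 = 0, sum;
	size_t i = 0;

	/* Two accumulators so consecutive additions are independent. */
	for (; i + 4 <= len; i += 4) {
		uint16_t w0, w1;

		memcpy(&w0, buf + i, 2);
		memcpy(&w1, buf + i + 2, 2);
		sum_0 += w0;
		sum_1 += w1;
	}
	if (i + 2 <= len) {
		uint16_t w;

		memcpy(&w, buf + i, 2);
		sum_0 += w;
		i += 2;
	}
	if (i < len)			/* trailing odd byte */
		sum_0 += buf[i];

	/* Fold to 16 bits with end-around carry. */
	sum = sum_0 + sum_1;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	unsigned char pkt[31];
	unsigned int i;

	for (i = 0; i < sizeof(pkt); i++)
		pkt[i] = (unsigned char)(i * 7 + 1);
	printf("checksum: 0x%04x\n", ref_csum16(pkt, sizeof(pkt)));
	return 0;
}

Folding the kernel routine's 32-bit result down to 16 bits in the same way
and comparing it against this sketch on the same buffer is a quick way to
catch tail-handling or folding mistakes.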