Richard Guenther <richard.guent...@gmail.com> wrote on 2010/02/14 19:05:24: > > On Sun, Feb 14, 2010 at 5:51 PM, Joakim Tjernlund > <joakim.tjernl...@transmode.se> wrote: > > > > Noticed while optimizing crc16 that gcc -O performed much better > > than gcc -O2 while doing crc16: > > Reducing the noise by adding a loop with trip count 64, making sure > my powersaving model is fixed at performance I see > > -O1: > crc1:f532 crc2:f532 > crc16 tv_res:1 :387072 > CRC16 tv_res:1 :100397 > > -O2: > crc1:f532 crc2:f532 > crc16 tv_res:1 :301706 > CRC16 tv_res:1 :77103
The new CRC16 seems a lot faster :) > > so it's faster, with GCC 4.4.3. > > It's indeed slower with GCC 4.3.4 though. > > But your benchmark seems artificial enough that GCC 4.5 optimizes > it away - it manages to see that CRC16 and crc16 are pure functions, > thus it only retains their last calls. At least at -O1, at -O2 it inlines > all functions into main and isn't that clever anymore in the end. > > So - beware of benchmarks. The warmup isn't really needed after I added the memset, and inlining or not should not matter that much, as the functions are only used once, so I think my conclusion still stands: gcc 4.3.4 is slower with -O2 than with -O1. Glad to hear that newer versions are back on track; does that mean that this won't be fixed in the gcc 4.3.x series? > > Richard. > > > # > gcc -O1 CRC16.c ;./a.out > > crc1:f532 crc2:f532 > > crc16 tv_res:0 :12768 > > CRC16 tv_res:0 :10795 > > # > gcc -O2 CRC16.c ;./a.out > > crc1:f532 crc2:f532 > > crc16 tv_res:0 :17092 > > CRC16 tv_res:0 :11581 > > > > #> gcc --version > > gcc (Gentoo 4.3.4 p1.0, pie-10.1.5) 4.3.4 > > > > cpu: > > vendor_id : GenuineIntel > > cpu family : 6 > > model : 23 > > model name : Intel(R) Core(TM)2 Duo CPU E8500 @ 3.16GHz > > stepping : 10 > > cpu MHz : 3159.236 > > > > Here is the CRC16.c: > > > > #define u32 unsigned long > > #define u16 unsigned short > > #define u8 unsigned char > > > > #include <sys/time.h> > > > > #include <stdio.h> > > #include <stdlib.h> > > > > /** CRC table for the CRC-16. 
The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */ > > u16 const crc16_table[256] = { > > 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, > > 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, > > 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, > > 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, > > 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, > > 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, > > 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, > > 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, > > 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, > > 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, > > 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, > > 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, > > 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, > > 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, > > 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, > > 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, > > 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, > > 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, > > 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, > > 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, > > 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, > > 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, > > 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, > > 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, > > 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, > > 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, > > 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, > > 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, > > 0x8801, 
0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, > > 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, > > 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, > > 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 > > }; > > #include <asm/byteorder.h> > > #define tole(x) __constant_cpu_to_le16(x) > > u16 const crc16_table_le[256] = { > > tole(0x0000), tole(0xC0C1), tole(0xC181), tole(0x0140), tole(0xC301), > > tole(0x03C0), tole(0x0280), tole(0xC241), tole(0xC601), tole(0x06C0), > > tole(0x0780), tole(0xC741), tole(0x0500), tole(0xC5C1), tole(0xC481), > > tole(0x0440), tole(0xCC01), tole(0x0CC0), tole(0x0D80), tole(0xCD41), > > tole(0x0F00), tole(0xCFC1), tole(0xCE81), tole(0x0E40), tole(0x0A00), > > tole(0xCAC1), tole(0xCB81), tole(0x0B40), tole(0xC901), tole(0x09C0), > > tole(0x0880), tole(0xC841), tole(0xD801), tole(0x18C0), tole(0x1980), > > tole(0xD941), tole(0x1B00), tole(0xDBC1), tole(0xDA81), tole(0x1A40), > > tole(0x1E00), tole(0xDEC1), tole(0xDF81), tole(0x1F40), tole(0xDD01), > > tole(0x1DC0), tole(0x1C80), tole(0xDC41), tole(0x1400), tole(0xD4C1), > > tole(0xD581), tole(0x1540), tole(0xD701), tole(0x17C0), tole(0x1680), > > tole(0xD641), tole(0xD201), tole(0x12C0), tole(0x1380), tole(0xD341), > > tole(0x1100), tole(0xD1C1), tole(0xD081), tole(0x1040), tole(0xF001), > > tole(0x30C0), tole(0x3180), tole(0xF141), tole(0x3300), tole(0xF3C1), > > tole(0xF281), tole(0x3240), tole(0x3600), tole(0xF6C1), tole(0xF781), > > tole(0x3740), tole(0xF501), tole(0x35C0), tole(0x3480), tole(0xF441), > > tole(0x3C00), tole(0xFCC1), tole(0xFD81), tole(0x3D40), tole(0xFF01), > > tole(0x3FC0), tole(0x3E80), tole(0xFE41), tole(0xFA01), tole(0x3AC0), > > tole(0x3B80), tole(0xFB41), tole(0x3900), tole(0xF9C1), tole(0xF881), > > tole(0x3840), tole(0x2800), tole(0xE8C1), tole(0xE981), tole(0x2940), > > tole(0xEB01), tole(0x2BC0), tole(0x2A80), tole(0xEA41), tole(0xEE01), > > tole(0x2EC0), tole(0x2F80), tole(0xEF41), tole(0x2D00), 
tole(0xEDC1), > > tole(0xEC81), tole(0x2C40), tole(0xE401), tole(0x24C0), tole(0x2580), > > tole(0xE541), tole(0x2700), tole(0xE7C1), tole(0xE681), tole(0x2640), > > tole(0x2200), tole(0xE2C1), tole(0xE381), tole(0x2340), tole(0xE101), > > tole(0x21C0), tole(0x2080), tole(0xE041), tole(0xA001), tole(0x60C0), > > tole(0x6180), tole(0xA141), tole(0x6300), tole(0xA3C1), tole(0xA281), > > tole(0x6240), tole(0x6600), tole(0xA6C1), tole(0xA781), tole(0x6740), > > tole(0xA501), tole(0x65C0), tole(0x6480), tole(0xA441), tole(0x6C00), > > tole(0xACC1), tole(0xAD81), tole(0x6D40), tole(0xAF01), tole(0x6FC0), > > tole(0x6E80), tole(0xAE41), tole(0xAA01), tole(0x6AC0), tole(0x6B80), > > tole(0xAB41), tole(0x6900), tole(0xA9C1), tole(0xA881), tole(0x6840), > > tole(0x7800), tole(0xB8C1), tole(0xB981), tole(0x7940), tole(0xBB01), > > tole(0x7BC0), tole(0x7A80), tole(0xBA41), tole(0xBE01), tole(0x7EC0), > > tole(0x7F80), tole(0xBF41), tole(0x7D00), tole(0xBDC1), tole(0xBC81), > > tole(0x7C40), tole(0xB401), tole(0x74C0), tole(0x7580), tole(0xB541), > > tole(0x7700), tole(0xB7C1), tole(0xB681), tole(0x7640), tole(0x7200), > > tole(0xB2C1), tole(0xB381), tole(0x7340), tole(0xB101), tole(0x71C0), > > tole(0x7080), tole(0xB041), tole(0x5000), tole(0x90C1), tole(0x9181), > > tole(0x5140), tole(0x9301), tole(0x53C0), tole(0x5280), tole(0x9241), > > tole(0x9601), tole(0x56C0), tole(0x5780), tole(0x9741), tole(0x5500), > > tole(0x95C1), tole(0x9481), tole(0x5440), tole(0x9C01), tole(0x5CC0), > > tole(0x5D80), tole(0x9D41), tole(0x5F00), tole(0x9FC1), tole(0x9E81), > > tole(0x5E40), tole(0x5A00), tole(0x9AC1), tole(0x9B81), tole(0x5B40), > > tole(0x9901), tole(0x59C0), tole(0x5880), tole(0x9841), tole(0x8801), > > tole(0x48C0), tole(0x4980), tole(0x8941), tole(0x4B00), tole(0x8BC1), > > tole(0x8A81), tole(0x4A40), tole(0x4E00), tole(0x8EC1), tole(0x8F81), > > tole(0x4F40), tole(0x8D01), tole(0x4DC0), tole(0x4C80), tole(0x8C41), > > tole(0x4400), tole(0x84C1), tole(0x8581), tole(0x4540), 
tole(0x8701), > > tole(0x47C0), tole(0x4680), tole(0x8641), tole(0x8201), tole(0x42C0), > > tole(0x4380), tole(0x8341), tole(0x4100), tole(0x81C1), tole(0x8081), > > tole(0x4040) > > }; > > > > extern u16 const crc16_table[256]; > > > > extern u16 crc16(u16 crc, const u8 *buffer, u32 len); > > > > static inline u16 crc16_byte(u16 crc, const u8 data) > > { > > return (crc >> 8) ^ crc16_table[(crc ^ data) & 0xff]; > > } > > > > /** > > * crc16 - compute the CRC-16 for the data buffer > > * @crc: previous CRC value > > * @buffer: data pointer > > * @len: number of bytes in the buffer > > * > > * Returns the updated CRC value. > > */ > > u16 crc16(u16 crc, u8 const *buffer, u32 len) > > { > > while (len--) > > crc = crc16_byte(crc, *buffer++); > > return crc; > > } > > #include <endian.h> > > # if __BYTE_ORDER == __LITTLE_ENDIAN > > # define DO_CRC16(x) crc = tab[(crc ^ (x)) & 255] ^ (crc >> 8) > > # else > > # define DO_CRC16(x) crc = tab[((crc >> 8) ^ (x))] ^ (crc << 8) > > # endif > > > > static inline u16 CRC16_byte(u16 crc, const u8 data) > > { > > const u16 *tab = crc16_table_le; > > > > crc = __cpu_to_le16(crc); > > DO_CRC16(data); > > return __le16_to_cpu(crc); > > } > > > > u16 CRC16(u16 crc, u8 const *buffer, u32 len) > > { > > u32 loops; > > u16 *b; > > const u16 *tab = crc16_table_le;; > > > > crc = __cpu_to_le16(crc); > > /* Align */ > > if ((long)buffer & 1 && len) { > > DO_CRC16(*buffer++); > > --len; > > } > > loops = len >> 1; > > b = (u16 *)buffer; > > for(--b; loops; --loops) { > > crc ^= *++b; /* use pre increment for speed */ > > DO_CRC16(0); > > DO_CRC16(0); > > } > > if (len & 1) { > > u8 *p = (u8 *)(b + 1); > > DO_CRC16(*p); > > } > > return __le16_to_cpu(crc); > > } > > #define TST "1234567890" > > #define BUF_SIZ 5*1024*1024 > > main() > > { > > u16 crc1, crc2; > > char *buffer = malloc(BUF_SIZ); > > struct timeval tv, tv2, tv3, tv_res, tv2_res; > > > > memset(buffer, -1, BUF_SIZ); > > memcpy (buffer, TST, sizeof(TST)); > > /*warm up */ > > 
crc1 = crc16(~0, buffer, BUF_SIZ); > > crc2 = CRC16(~0, buffer, BUF_SIZ); > > > > gettimeofday(&tv, NULL); > > crc1 = crc16(~0, buffer, BUF_SIZ); > > gettimeofday(&tv2, NULL); > > crc2 = CRC16(~0, buffer, BUF_SIZ); > > gettimeofday(&tv3, NULL); > > timersub(&tv2, &tv, &tv_res); > > printf("crc1:%x crc2:%x\n", crc1, crc2); > > printf("crc16 tv_res:%d :%d\n", (int)tv_res.tv_sec, > > (int)tv_res.tv_usec); > > timersub(&tv3, &tv2, &tv2_res); > > printf("CRC16 tv_res:%d :%d\n", (int)tv2_res.tv_sec, > > (int)tv2_res.tv_usec); > > } > > > > >