Hello, on 14.02.10, you wrote:
> > Glad to hear that newer versions are back on track, does that mean > that this won't be fixed in gcc 4.3.x series? Maybe you could test GCC version 4.5.0. I noticed that in zlib (used in PNG image compression), when an image contains lots of identical pixel values, there was a speedup of 30-50%. That was not on an x86 CPU, but I mention it only to show that GCC 4.5.0 does bring a speedup. > >> >> Richard. >> >>> # > gcc -O1 CRC16.c ;./a.out >>> crc1:f532 crc2:f532 >>> crc16 tv_res:0 :12768 >>> CRC16 tv_res:0 :10795 >>> # > gcc -O2 CRC16.c ;./a.out >>> crc1:f532 crc2:f532 >>> crc16 tv_res:0 :17092 >>> CRC16 tv_res:0 :11581 >>> >>> #> gcc --version >>> gcc (Gentoo 4.3.4 p1.0, pie-10.1.5) 4.3.4 >>> >>> cpu: >>> vendor_id : GenuineIntel >>> cpu family : 6 >>> model : 23 >>> model name : Intel(R) Core(TM)2 Duo CPU E8500 @ 3.16GHz >>> stepping : 10 >>> cpu MHz : 3159.236 >>> >>> Here is the CRC16.c: >>> >>> #define u32 unsigned long >>> #define u16 unsigned short >>> #define u8 unsigned char >>> >>> #include <sys/time.h> >>> >>> #include <stdio.h> >>> #include <stdlib.h> >>> >>> /** CRC table for the CRC-16. 
The poly is 0x8005 (x^16 + x^15 + x^2 + >> 1) */ >>> u16 const crc16_table[256] = { >>> 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, >>> 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, >>> 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, >>> 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, >>> 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, >>> 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, >>> 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, >>> 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, >>> 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, >>> 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, >>> 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, >>> 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, >>> 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, >>> 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, >>> 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, >>> 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, >>> 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, >>> 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, >>> 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, >>> 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, >>> 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, >>> 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, >>> 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, >>> 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, >>> 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, >>> 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, >>> 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, >>> 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, >>> 0x8801, 
0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, >>> 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, >>> 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, >>> 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 >>> }; >>> #include <asm/byteorder.h> >>> #define tole(x) __constant_cpu_to_le16(x) >>> u16 const crc16_table_le[256] = { >>> tole(0x0000), tole(0xC0C1), tole(0xC181), tole(0x0140), tole(0xC301), >>> tole(0x03C0), tole(0x0280), tole(0xC241), tole(0xC601), tole(0x06C0), >>> tole(0x0780), tole(0xC741), tole(0x0500), tole(0xC5C1), tole(0xC481), >>> tole(0x0440), tole(0xCC01), tole(0x0CC0), tole(0x0D80), tole(0xCD41), >>> tole(0x0F00), tole(0xCFC1), tole(0xCE81), tole(0x0E40), tole(0x0A00), >>> tole(0xCAC1), tole(0xCB81), tole(0x0B40), tole(0xC901), tole(0x09C0), >>> tole(0x0880), tole(0xC841), tole(0xD801), tole(0x18C0), tole(0x1980), >>> tole(0xD941), tole(0x1B00), tole(0xDBC1), tole(0xDA81), tole(0x1A40), >>> tole(0x1E00), tole(0xDEC1), tole(0xDF81), tole(0x1F40), tole(0xDD01), >>> tole(0x1DC0), tole(0x1C80), tole(0xDC41), tole(0x1400), tole(0xD4C1), >>> tole(0xD581), tole(0x1540), tole(0xD701), tole(0x17C0), tole(0x1680), >>> tole(0xD641), tole(0xD201), tole(0x12C0), tole(0x1380), tole(0xD341), >>> tole(0x1100), tole(0xD1C1), tole(0xD081), tole(0x1040), tole(0xF001), >>> tole(0x30C0), tole(0x3180), tole(0xF141), tole(0x3300), tole(0xF3C1), >>> tole(0xF281), tole(0x3240), tole(0x3600), tole(0xF6C1), tole(0xF781), >>> tole(0x3740), tole(0xF501), tole(0x35C0), tole(0x3480), tole(0xF441), >>> tole(0x3C00), tole(0xFCC1), tole(0xFD81), tole(0x3D40), tole(0xFF01), >>> tole(0x3FC0), tole(0x3E80), tole(0xFE41), tole(0xFA01), tole(0x3AC0), >>> tole(0x3B80), tole(0xFB41), tole(0x3900), tole(0xF9C1), tole(0xF881), >>> tole(0x3840), tole(0x2800), tole(0xE8C1), tole(0xE981), tole(0x2940), >>> tole(0xEB01), tole(0x2BC0), tole(0x2A80), tole(0xEA41), tole(0xEE01), >>> tole(0x2EC0), tole(0x2F80), tole(0xEF41), tole(0x2D00), 
tole(0xEDC1), >>> tole(0xEC81), tole(0x2C40), tole(0xE401), tole(0x24C0), tole(0x2580), >>> tole(0xE541), tole(0x2700), tole(0xE7C1), tole(0xE681), tole(0x2640), >>> tole(0x2200), tole(0xE2C1), tole(0xE381), tole(0x2340), tole(0xE101), >>> tole(0x21C0), tole(0x2080), tole(0xE041), tole(0xA001), tole(0x60C0), >>> tole(0x6180), tole(0xA141), tole(0x6300), tole(0xA3C1), tole(0xA281), >>> tole(0x6240), tole(0x6600), tole(0xA6C1), tole(0xA781), tole(0x6740), >>> tole(0xA501), tole(0x65C0), tole(0x6480), tole(0xA441), tole(0x6C00), >>> tole(0xACC1), tole(0xAD81), tole(0x6D40), tole(0xAF01), tole(0x6FC0), >>> tole(0x6E80), tole(0xAE41), tole(0xAA01), tole(0x6AC0), tole(0x6B80), >>> tole(0xAB41), tole(0x6900), tole(0xA9C1), tole(0xA881), tole(0x6840), >>> tole(0x7800), tole(0xB8C1), tole(0xB981), tole(0x7940), tole(0xBB01), >>> tole(0x7BC0), tole(0x7A80), tole(0xBA41), tole(0xBE01), tole(0x7EC0), >>> tole(0x7F80), tole(0xBF41), tole(0x7D00), tole(0xBDC1), tole(0xBC81), >>> tole(0x7C40), tole(0xB401), tole(0x74C0), tole(0x7580), tole(0xB541), >>> tole(0x7700), tole(0xB7C1), tole(0xB681), tole(0x7640), tole(0x7200), >>> tole(0xB2C1), tole(0xB381), tole(0x7340), tole(0xB101), tole(0x71C0), >>> tole(0x7080), tole(0xB041), tole(0x5000), tole(0x90C1), tole(0x9181), >>> tole(0x5140), tole(0x9301), tole(0x53C0), tole(0x5280), tole(0x9241), >>> tole(0x9601), tole(0x56C0), tole(0x5780), tole(0x9741), tole(0x5500), >>> tole(0x95C1), tole(0x9481), tole(0x5440), tole(0x9C01), tole(0x5CC0), >>> tole(0x5D80), tole(0x9D41), tole(0x5F00), tole(0x9FC1), tole(0x9E81), >>> tole(0x5E40), tole(0x5A00), tole(0x9AC1), tole(0x9B81), tole(0x5B40), >>> tole(0x9901), tole(0x59C0), tole(0x5880), tole(0x9841), tole(0x8801), >>> tole(0x48C0), tole(0x4980), tole(0x8941), tole(0x4B00), tole(0x8BC1), >>> tole(0x8A81), tole(0x4A40), tole(0x4E00), tole(0x8EC1), tole(0x8F81), >>> tole(0x4F40), tole(0x8D01), tole(0x4DC0), tole(0x4C80), tole(0x8C41), >>> tole(0x4400), tole(0x84C1), tole(0x8581), tole(0x4540), 
tole(0x8701), >>> tole(0x47C0), tole(0x4680), tole(0x8641), tole(0x8201), tole(0x42C0), >>> tole(0x4380), tole(0x8341), tole(0x4100), tole(0x81C1), tole(0x8081), >>> tole(0x4040) >>> }; >>> >>> extern u16 const crc16_table[256]; >>> >>> extern u16 crc16(u16 crc, const u8 *buffer, u32 len); >>> >>> static inline u16 crc16_byte(u16 crc, const u8 data) >>> { >>> return (crc >> 8) ^ crc16_table[(crc ^ data) & 0xff]; >>> } >>> >>> /** >>> * crc16 - compute the CRC-16 for the data buffer >>> * @crc: previous CRC value >>> * @buffer: data pointer >>> * @len: number of bytes in the buffer >>> * >>> * Returns the updated CRC value. >>> */ >>> u16 crc16(u16 crc, u8 const *buffer, u32 len) >>> { >>> while (len--) >>> crc = crc16_byte(crc, *buffer++); >>> return crc; >>> } >>> #include <endian.h> >>> # if __BYTE_ORDER == __LITTLE_ENDIAN >>> # define DO_CRC16(x) crc = tab[(crc ^ (x)) & 255] ^ (crc >> 8) >>> # else >>> # define DO_CRC16(x) crc = tab[((crc >> 8) ^ (x))] ^ (crc << 8) >>> # endif >>> >>> static inline u16 CRC16_byte(u16 crc, const u8 data) >>> { >>> const u16 *tab = crc16_table_le; >>> >>> crc = __cpu_to_le16(crc); >>> DO_CRC16(data); >>> return __le16_to_cpu(crc); >>> } >>> >>> u16 CRC16(u16 crc, u8 const *buffer, u32 len) >>> { >>> u32 loops; >>> u16 *b; >>> const u16 *tab = crc16_table_le;; >>> >>> crc = __cpu_to_le16(crc); >>> /* Align */ >>> if ((long)buffer & 1 && len) { >>> DO_CRC16(*buffer++); >>> --len; >>> } >>> loops = len >> 1; >>> b = (u16 *)buffer; >>> for(--b; loops; --loops) { >>> crc ^= *++b; /* use pre increment for speed */ >>> DO_CRC16(0); >>> DO_CRC16(0); >>> } >>> if (len & 1) { >>> u8 *p = (u8 *)(b + 1); >>> DO_CRC16(*p); >>> } >>> return __le16_to_cpu(crc); >>> } >>> #define TST "1234567890" >>> #define BUF_SIZ 5*1024*1024 >>> main() >>> { >>> u16 crc1, crc2; >>> char *buffer = malloc(BUF_SIZ); >>> struct timeval tv, tv2, tv3, tv_res, tv2_res; >>> >>> memset(buffer, -1, BUF_SIZ); >>> memcpy (buffer, TST, sizeof(TST)); >>> /*warm up */ >>> 
crc1 = crc16(~0, buffer, BUF_SIZ); >>> crc2 = CRC16(~0, buffer, BUF_SIZ); >>> >>> gettimeofday(&tv, NULL); >>> crc1 = crc16(~0, buffer, BUF_SIZ); >>> gettimeofday(&tv2, NULL); >>> crc2 = CRC16(~0, buffer, BUF_SIZ); >>> gettimeofday(&tv3, NULL); >>> timersub(&tv2, &tv, &tv_res); >>> printf("crc1:%x crc2:%x\n", crc1, crc2); >>> printf("crc16 tv_res:%d :%d\n", (int)tv_res.tv_sec, >> (int)tv_res.tv_usec); >>> timersub(&tv3, &tv2, &tv2_res); >>> printf("CRC16 tv_res:%d :%d\n", (int)tv2_res.tv_sec, >> (int)tv2_res.tv_usec); >>> } >>> >>> >> > Regards