Hi,

I recently revised some speed tests of basic CPU operations. There were a few surprises, but one was that a test of double-precision divides ran a factor of ten slower when compiled with gcc than with the Intel compiler icc.
This was with full optimization turned on, on an Intel Core Duo (Yonah) processor. I figured gcc was simply not using SSE2 and icc was, but that is not the case at all. While gcc produces apparently reasonable SSE2 assembler, icc does something quite different. What's going on? (My attempt at decoding the icc loop is in the PS below.)

Find the .c file attached. Assembler snippets follow.

------------ gcc has this (gcc -std=c99 -O3 -msse2 -mfpmath=sse -lm -S dt.c) ------------

.L27:
        movapd  (%esi,%eax), %xmm3      ;load 2 dbls at *(esi+eax) into xmm3
        divpd   192(%esp,%eax), %xmm3   ;divide by 2 dbls at *(esp+192+eax), result -> xmm3
        movapd  %xmm3, (%esi,%eax)      ;store the 2 dbls from xmm3 back
        addl    $16, %eax               ;add 16 (length of 2 doubles) to eax
        cmpl    $16384, %eax            ;compare eax to 16384 (2048 doubles * 8 bytes)
        jne     .L27                    ;if not at the end of the array, do it again

------------ icc has this (icc -Wall -w2 -fast -c dt.c) ------------

                                # LOE eax xmm2
..B1.69:                        # Preds ..B1.71 ..B1.68
        movsd     8336(%esp,%eax,8), %xmm1          #108.30
        movsd     _2il0floatpacket.13, %xmm0        #108.2
        divsd     24720(%esp,%eax,8), %xmm0         #108.2
        unpcklpd  %xmm2, %xmm1                      #108.30
        xorl      %edx, %edx                        #
        movddup   %xmm0, %xmm0                      #108.2
        movddup   %xmm0, %xmm0                      #108.2
                                # LOE eax edx xmm0 xmm1 xmm2
..B1.70:                        # Preds ..B1.70 ..B1.69
        mulpd     %xmm0, %xmm1                      #108.2
        mulpd     %xmm0, %xmm1                      #108.2
        mulpd     %xmm0, %xmm1                      #108.2
        mulpd     %xmm0, %xmm1                      #108.2
        addl      $8, %edx                          #
        cmpl      $131072, %edx                     #108.2
        jb        ..B1.70       # Prob 99%          #108.2

--
Steve White                                   +49(331)7499-202
e-Science / AstroGrid-D                       Zi. 35  Bg. 20
Astrophysikalisches Institut Potsdam (AIP)
An der Sternwarte 16, D-14482 Potsdam
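PS  As far as I can decode the icc output -- assuming _2il0floatpacket.13 holds the constant 1.0, which I have not verified -- icc does one real divide per array element to form a reciprocal, interchanges the loops, and then performs all ITERATIONS updates of that element as multiplies by the reciprocal, unrolled four ways. A rough C rendering (the function below is my reconstruction from the assembler, not icc's actual intermediate code):

    /* Guess at what icc turned double_array_divs_variable() into,
       using the ITERATIONS and size constants from dt.c. */
    void double_array_divs_variable_as_icc( double * restrict dvec1,
                                            double * restrict dvec2 )
    {
        long i, j;
        for( i = 0; i < size; i++ )
        {
            /* ..B1.69: one divsd, then movddup to both lanes */
            const double recip = 1.0 / dvec2[i];
            double x = dvec1[i];
            /* ..B1.70: the unrolled mulpd loop */
            for( j = 0; j < ITERATIONS; j++ )
                x *= recip;
            dvec1[i] = x;
        }
    }

If that reading is right, icc replaces ITERATIONS divides per element with one divide and ITERATIONS multiplies. But x * (1.0/d) is not the same as x / d in IEEE arithmetic, so presumably something in -fast licenses the substitution.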
/* dt.c -- timing test for double-precision divides */

#define _POSIX_C_SOURCE 200112L   /* for posix_memalign() with -std=c99 */

#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/resource.h>

enum { ITERATIONS = 131072, size = 2048 };

inline void
double_array_divs_variable( double * restrict dvec1, double * restrict dvec2 )
{
    long i, j;
    for( j = 0; j < ITERATIONS; j++ )
        for( i = 0; i < size; i++ )
            dvec1[i] /= dvec2[i];
}

static const int who = RUSAGE_SELF;
static struct rusage local;
static time_t tv_sec;
static long tv_usec;

void START_CLOCK()
{
    getrusage( who, &local );
}

/* user-mode milliseconds since the last START_CLOCK()/MS_SINCE() call */
long MS_SINCE()
{
    return tv_usec = local.ru_utime.tv_usec,
           tv_sec  = local.ru_utime.tv_sec,
           getrusage( who, &local ),
           (long)( ( local.ru_utime.tv_sec  - tv_sec  ) * 1000
                 + ( local.ru_utime.tv_usec - tv_usec ) / 1000 );
}

int main( int argc, char *argv[] )
{
    double *dvec1, *dvec2;
    const char *compiler = NULL;
    long i;

    printf( " SpeedTest >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n" );

#ifdef __INTEL_COMPILER
    compiler = "INTEL";
#elif defined( __PATHSCALE__ )
    compiler = "PathScale";
#elif defined( __PGI )
    compiler = "Portland Group";
#elif defined( __GNUC__ )
    compiler = "Gnu gcc";
#endif

    /* 16-byte alignment so movapd can be used on the arrays */
    posix_memalign( (void **)&dvec1, 16, size * sizeof(double) );
    posix_memalign( (void **)&dvec2, 16, size * sizeof(double) );

    printf( " C version" );
    if( compiler )
        printf( ", %s compiler ", compiler );
    printf( "\n" );
    printf( " size of int: %zu  size of long: %zu  size of double: %zu\n",
            sizeof( int ), sizeof( long ), sizeof( double ) );
    printf( " %i iterations of each test. ", ITERATIONS );
    printf( " inner loop / array size %i.\n", size );

    for( i = 0; i < size; i++ )
    {
        dvec1[i] = 1.0000001 * cosf( (float)(size - i) );
        dvec2[i] = 1.0 + 0.0000000001 * sinf( (float)i );
    }

    START_CLOCK();
    double_array_divs_variable( dvec1, dvec2 );
    printf( "%-38s %4ld ms [%15.6e]\n",
            "double array divs by var", MS_SINCE(), dvec1[0] );

    return 0;
}
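For reference, the timing binaries were built along these lines (the flags are the ones shown with the snippets above; the output names here are just for illustration):

    gcc -std=c99 -O3 -msse2 -mfpmath=sse dt.c -lm -o dt_gcc
    icc -Wall -w2 -fast dt.c -o dt_icc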