Hi,

I recently revised some speed tests of basic CPU operations. There were a few surprises, but one was that a test of double-precision divides ran a factor of ten slower when compiled with gcc than with the Intel compiler icc.
This was with full optimization turned on, on an Intel Core Duo (Yonah) processor. I figured gcc was simply not using SSE2 and icc was, but that is not the case at all. While gcc produces apparently reasonable SSE2 assembler, icc does something quite different. What's going on? (My attempt at decoding the icc loop is in the PS below.)

Find the .c file attached. Assembler snippets follow.

------------ gcc has this (gcc -std=c99 -O3 -msse2 -mfpmath=sse -lm -S dt.c) ------------

.L27:
        movapd  (%esi,%eax), %xmm3      ;load 2 dbls at *(esi+eax) into xmm3
        divpd   192(%esp,%eax), %xmm3   ;divide by 2 dbls at *(esp+192+eax), result -> xmm3
        movapd  %xmm3, (%esi,%eax)      ;store the 2 dbls from xmm3 back
        addl    $16, %eax               ;add 16 (length of 2 doubles) to eax
        cmpl    $16384, %eax            ;compare eax to 16384 (2048 doubles * 8 bytes)
        jne     .L27                    ;if not at the end of the array, do it again

------------ icc has this (icc -Wall -w2 -fast -c dt.c) ------------

                                # LOE eax xmm2
..B1.69:                        # Preds ..B1.71 ..B1.68
        movsd     8336(%esp,%eax,8), %xmm1          #108.30
        movsd     _2il0floatpacket.13, %xmm0        #108.2
        divsd     24720(%esp,%eax,8), %xmm0         #108.2
        unpcklpd  %xmm2, %xmm1                      #108.30
        xorl      %edx, %edx                        #
        movddup   %xmm0, %xmm0                      #108.2
        movddup   %xmm0, %xmm0                      #108.2
                                # LOE eax edx xmm0 xmm1 xmm2
..B1.70:                        # Preds ..B1.70 ..B1.69
        mulpd     %xmm0, %xmm1                      #108.2
        mulpd     %xmm0, %xmm1                      #108.2
        mulpd     %xmm0, %xmm1                      #108.2
        mulpd     %xmm0, %xmm1                      #108.2
        addl      $8, %edx                          #
        cmpl      $131072, %edx                     #108.2
        jb        ..B1.70       # Prob 99%          #108.2

--
Steve White                                   +49(331)7499-202
e-Science / AstroGrid-D                       Zi. 35  Bg. 20
Astrophysikalisches Institut Potsdam (AIP)
An der Sternwarte 16, D-14482 Potsdam
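PS  As far as I can decode the icc output -- assuming _2il0floatpacket.13 holds the constant 1.0, which I have not verified -- icc does one real divide per array element to form a reciprocal, interchanges the loops, and then performs all ITERATIONS updates of that element as multiplies by the reciprocal, unrolled four ways. A rough C rendering (the function below is my reconstruction from the assembler, not icc's actual intermediate code):

    /* Guess at what icc turned double_array_divs_variable() into,
       using the ITERATIONS and size constants from dt.c. */
    void double_array_divs_variable_as_icc( double * restrict dvec1,
                                            double * restrict dvec2 )
    {
        long i, j;
        for( i = 0; i < size; i++ )
        {
            /* ..B1.69: one divsd, then movddup to both lanes */
            const double recip = 1.0 / dvec2[i];
            double x = dvec1[i];
            /* ..B1.70: the unrolled mulpd loop */
            for( j = 0; j < ITERATIONS; j++ )
                x *= recip;
            dvec1[i] = x;
        }
    }

If that reading is right, icc replaces ITERATIONS divides per element with one divide and ITERATIONS multiplies. But x * (1.0/d) is not the same as x / d in IEEE arithmetic, so presumably something in -fast licenses the substitution.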
/* dt.c -- timing test for double-precision divides */

#define _POSIX_C_SOURCE 200112L   /* for posix_memalign() with -std=c99 */

#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/resource.h>

enum { ITERATIONS = 131072, size = 2048 };

inline void
double_array_divs_variable( double * restrict dvec1, double * restrict dvec2 )
{
    long i, j;
    for( j = 0; j < ITERATIONS; j++ )
        for( i = 0; i < size; i++ )
            dvec1[i] /= dvec2[i];
}

static const int who = RUSAGE_SELF;
static struct rusage local;
static time_t tv_sec;
static long tv_usec;

void START_CLOCK()
{
    getrusage( who, &local );
}

/* user-mode milliseconds since the last START_CLOCK()/MS_SINCE() call */
long MS_SINCE()
{
    return tv_usec = local.ru_utime.tv_usec,
           tv_sec  = local.ru_utime.tv_sec,
           getrusage( who, &local ),
           (long)( ( local.ru_utime.tv_sec  - tv_sec  ) * 1000
                 + ( local.ru_utime.tv_usec - tv_usec ) / 1000 );
}

int main( int argc, char *argv[] )
{
    double *dvec1, *dvec2;
    const char *compiler = NULL;
    long i;

    printf( " SpeedTest >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n" );

#ifdef __INTEL_COMPILER
    compiler = "INTEL";
#elif defined( __PATHSCALE__ )
    compiler = "PathScale";
#elif defined( __PGI )
    compiler = "Portland Group";
#elif defined( __GNUC__ )
    compiler = "Gnu gcc";
#endif

    /* 16-byte alignment so movapd can be used on the arrays */
    posix_memalign( (void **)&dvec1, 16, size * sizeof(double) );
    posix_memalign( (void **)&dvec2, 16, size * sizeof(double) );

    printf( " C version" );
    if( compiler )
        printf( ", %s compiler ", compiler );
    printf( "\n" );
    printf( " size of int: %zu  size of long: %zu  size of double: %zu\n",
            sizeof( int ), sizeof( long ), sizeof( double ) );
    printf( " %i iterations of each test. ", ITERATIONS );
    printf( " inner loop / array size %i.\n", size );

    for( i = 0; i < size; i++ )
    {
        dvec1[i] = 1.0000001 * cosf( (float)(size - i) );
        dvec2[i] = 1.0 + 0.0000000001 * sinf( (float)i );
    }

    START_CLOCK();
    double_array_divs_variable( dvec1, dvec2 );
    printf( "%-38s %4ld ms [%15.6e]\n",
            "double array divs by var", MS_SINCE(), dvec1[0] );

    return 0;
}
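For reference, the timing binaries were built along these lines (the flags are the ones shown with the snippets above; the output names here are just for illustration):

    gcc -std=c99 -O3 -msse2 -mfpmath=sse dt.c -lm -o dt_gcc
    icc -Wall -w2 -fast dt.c -o dt_icc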