On 22/02/14 22:15, Nathaniel Smith wrote:

$ make TARGET=SANDYBRIDGE USE_OPENMP=0 BINARY=64 NOFORTRAN=1

You'll definitely want to disable the affinity support too, and
probably memory warmup. And possibly increase the maximum thread
count, unless you'll only use the library on the computer it was built
on. And maybe other things. The OpenBLAS build process has so many
ways to accidentally impale yourself, it's an object lesson in why
building regulations are a good thing.

Thanks for the advice.

Right now I am just testing on my own computer.

cblas_dgemm is running roughly 50% faster with OpenBLAS than with MKL 11.1 update 2; sometimes OpenBLAS is even twice as fast as MKL.

WTF???

:-D

OK, next up is Accelerate. Let's see how it compares to OpenBLAS and MKL on Mavericks.


Sturla


#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

/*
 * Convert the interval between two mach_absolute_time() readings into
 * nanoseconds.
 *
 * _t0: reading taken at the start of the interval.
 * _t1: reading taken at the end of the interval.
 *
 * The tick difference is scaled by the numer/denom ratio reported by
 * mach_timebase_info(). All arithmetic is done in long double and the
 * result is narrowed to double only on return.
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1)
{
    mach_timebase_info_data_t timebase;
    mach_timebase_info(&timebase);

    /* Convert each reading before subtracting so the difference is
     * computed in floating point rather than in unsigned arithmetic. */
    const long double ticks = (long double)_t1 - (long double)_t0;
    const long double scale_numer = (long double)timebase.numer;
    const long double scale_denom = (long double)timebase.denom;

    return (double)(ticks * scale_numer / scale_denom);
}

/*
 * Benchmark driver for MKL: times a single cblas_dgemm call on square
 * n-by-n double matrices (n = 512, row-major, no transposes,
 * alpha = beta = 1.0) and prints the elapsed time in nanoseconds.
 *
 * The three matrices are allocated with mkl_malloc using a 64-byte
 * alignment request and are filled with finite values before timing.
 *
 * Returns EXIT_SUCCESS after a completed measurement, or EXIT_FAILURE if
 * any of the three allocations fails.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    const int BOUNDARY = 64;
    const int n = 512;
    const int m = n, k = n;
    /* Widen before multiplying so the element count is computed in size_t. */
    const size_t count = (size_t)n * (size_t)n;

    double *A = mkl_malloc(count * sizeof *A, BOUNDARY);
    double *B = mkl_malloc(count * sizeof *B, BOUNDARY);
    double *C = mkl_malloc(count * sizeof *C, BOUNDARY);

    if (A == NULL || B == NULL || C == NULL) {
        printf("failed to allocate %d x %d matrices\n", n, n);
        /* Release only what was actually obtained. */
        if (A != NULL) {
            mkl_free(A);
        }
        if (B != NULL) {
            mkl_free(B);
        }
        if (C != NULL) {
            mkl_free(C);
        }
        return EXIT_FAILURE;
    }

    /*
     * dgemm reads A and B, and with beta = 1.0 it reads C as well. Give all
     * three well-defined, finite contents so the call never operates on
     * indeterminate memory. Writing every element also touches each buffer
     * before the timer starts.
     */
    for (size_t i = 0; i < count; i++) {
        A[i] = (double)rand() / (double)RAND_MAX;
        B[i] = (double)rand() / (double)RAND_MAX;
        C[i] = 0.0;
    }

    /*
     * NOTE(review): this times one call, and it is the first BLAS call in
     * the process, so any one-time start-up cost inside the library is part
     * of the reported figure. Consider a warm-up call or averaging over
     * several repetitions before comparing libraries.
     */
    const uint64_t t0 = mach_absolute_time();

    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0, A, k, B, n, 1.0, C, n);

    const uint64_t t1 = mach_absolute_time();

    const double nanosec = nanodiff(t0, t1);

    printf("elapsed time: %g ns\n", nanosec);

    mkl_free(A);
    mkl_free(B);
    mkl_free(C);
    return EXIT_SUCCESS;
}


#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include <cblas.h>

/*
 * Elapsed time, in nanoseconds, between two mach_absolute_time() readings.
 *
 * _t0: earlier reading.
 * _t1: later reading.
 *
 * The raw tick difference is multiplied by numer and divided by denom, the
 * two fields filled in by mach_timebase_info(). Intermediate values are
 * kept in long double; the return value is that result cast to double.
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1)
{
    mach_timebase_info_data_t tb;
    long double elapsed;

    mach_timebase_info(&tb);

    /* Subtract after conversion so the difference is a floating-point one. */
    elapsed = (long double)_t1 - (long double)_t0;
    elapsed = elapsed * (long double)tb.numer / (long double)tb.denom;

    return (double)elapsed;
}

/*
 * Benchmark driver for OpenBLAS: times a single cblas_dgemm call on square
 * n-by-n double matrices (n = 512, row-major, no transposes,
 * alpha = beta = 1.0) and prints the elapsed time in nanoseconds.
 *
 * The three matrices are allocated with malloc and are filled with finite
 * values before timing.
 *
 * Returns EXIT_SUCCESS after a completed measurement, or EXIT_FAILURE if
 * any of the three allocations fails.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    const int n = 512;
    const int m = n, k = n;
    /* Widen before multiplying so the element count is computed in size_t. */
    const size_t count = (size_t)n * (size_t)n;

    double *A = malloc(count * sizeof *A);
    double *B = malloc(count * sizeof *B);
    double *C = malloc(count * sizeof *C);

    if (A == NULL || B == NULL || C == NULL) {
        printf("failed to allocate %d x %d matrices\n", n, n);
        /* free(NULL) is a no-op, so no per-pointer guard is needed. */
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    /*
     * dgemm reads A and B, and with beta = 1.0 it reads C as well. Give all
     * three well-defined, finite contents so the call never operates on
     * indeterminate memory. Writing every element also touches each buffer
     * before the timer starts.
     */
    for (size_t i = 0; i < count; i++) {
        A[i] = (double)rand() / (double)RAND_MAX;
        B[i] = (double)rand() / (double)RAND_MAX;
        C[i] = 0.0;
    }

    /*
     * NOTE(review): this times one call, and it is the first BLAS call in
     * the process, so any one-time start-up cost inside the library is part
     * of the reported figure. Consider a warm-up call or averaging over
     * several repetitions before comparing libraries.
     */
    const uint64_t t0 = mach_absolute_time();

    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0, A, k, B, n, 1.0, C, n);

    const uint64_t t1 = mach_absolute_time();

    const double nanosec = nanodiff(t0, t1);

    printf("elapsed time: %g ns\n", nanosec);

    free(A);
    free(B);
    free(C);
    return EXIT_SUCCESS;
}


_______________________________________________
NumPy-Discussion mailing list
NumPy-Discussion@scipy.org
http://mail.scipy.org/mailman/listinfo/numpy-discussion

Reply via email to