https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95488

--- Comment #5 from Hongtao.liu <crazylht at gmail dot com> ---
Microbenchmark
----
cat test.c

#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>

/* 16-byte vector of char elements (GNU vector extension).  */
typedef char  v16qi  __attribute__ ((vector_size (16)));
/* The two multiply kernels being compared; both are defined in another
   translation unit and take and return v16qi vectors by value.  */
extern v16qi interleave_mul (v16qi, v16qi);
extern v16qi extend_mul (v16qi, v16qi);

/* Number of v16qi elements allocated for each input array.  */
#define LOOP 30000000


/* Benchmark driver.

   Allocates two arrays of LOOP vectors, fills every lane with a
   deterministic pattern, then times interleave_mul and extend_mul over
   every 16th vector pair, bracketing each loop with __rdtscp reads and
   printing the difference between the two readings.

   Returns 0 on success, or EXIT_FAILURE if the input buffers cannot be
   allocated.  */
int
main (void)
{
  int i;
  unsigned long long start, end;
  unsigned long long diff;
  unsigned int aux;
  v16qi *p0;
  v16qi *p1;
  v16qi x, y;

  p0 = malloc (LOOP * sizeof (*p0));
  p1 = malloc (LOOP * sizeof (*p1));
  if (p0 == NULL || p1 == NULL)
    {
      fprintf (stderr, "failed to allocate benchmark buffers\n");
      free (p0);
      free (p1);
      return EXIT_FAILURE;
    }

  for (i = 0; i < LOOP; i++)
    for (int j = 0; j != 16; j++)
      {
        /* Compute the fill values in unsigned arithmetic: i * i exceeds
           INT_MAX long before i reaches LOOP, and signed overflow is
           undefined.  Unsigned arithmetic wraps, and only the low byte
           is stored in each lane anyway.  */
        unsigned int ui = (unsigned int) i;
        unsigned int uj = (unsigned int) j;

        p0[i][j] = (char) (1u + ui + uj);
        p1[i][j] = (char) (1u + ui * ui + uj * uj);
      }

#if 1
  start = __rdtscp (&aux);
  for (i = 0; i < LOOP; i+=16)
    y = interleave_mul (p0[i], p1[i]);
  end = __rdtscp (&aux);
  diff = end - start;

  /* diff is unsigned long long, so it must be printed with %llu.  */
  printf ("interleave_mul : %llu\n", diff);

#endif

#if 1
  start = __rdtscp (&aux);
  for (i = 0; i < LOOP; i+=16)
    x = extend_mul (p0[i], p1[i]);
  end = __rdtscp (&aux);
  diff = end - start;

  printf ("extend_mul :    %llu\n", diff);
#endif

  free (p0);
  free (p1);

  return 0;
}
---
The results show a slight improvement:

interleave_mul : 104180000
extend_mul :    103922083

Reply via email to