https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95488
/* --- Comment #5 from Hongtao.liu <crazylht at gmail dot com> ---
   Microbenchmark ---- cat test.c  */

#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>

typedef char v16qi __attribute__ ((vector_size (16)));

/* The two byte-vector multiply variants under comparison.  They are defined
   in a separate translation unit.  */
extern v16qi interleave_mul (v16qi, v16qi);
extern v16qi extend_mul (v16qi, v16qi);

/* Number of v16qi elements allocated for each input array.  */
#define LOOP 30000000

/* Fill two arrays of LOOP vectors with data, then time a pass over every
   16th element with interleave_mul and with extend_mul.  For each variant
   the difference between the __rdtscp readings taken before and after the
   pass is printed.

   Returns 0 on success, EXIT_FAILURE if the input arrays cannot be
   allocated.  */
int
main ()
{
  int i;
  unsigned long long start, end;
  unsigned long long diff;
  unsigned int aux;
  v16qi *p0;
  v16qi *p1;
  v16qi x, y;

  p0 = malloc (LOOP * sizeof (*p0));
  p1 = malloc (LOOP * sizeof (*p1));
  /* Each array is LOOP * 16 bytes, so allocation failure is a realistic
     outcome; bail out instead of dereferencing a null pointer.  */
  if (p0 == NULL || p1 == NULL)
    {
      fprintf (stderr, "failed to allocate input arrays\n");
      free (p0);
      free (p1);
      return EXIT_FAILURE;
    }

  for (i = 0; i < LOOP; i++)
    for (int j = 0; j != 16; j++)
      {
	p0[i][j] = 1 + i + j;
	/* i * i exceeds INT_MAX for large i, and signed overflow is
	   undefined.  Do the arithmetic in unsigned, where wrap-around is
	   well defined; only the low 8 bits end up in the lane.  */
	p1[i][j] = (char) (1u + (unsigned int) i * (unsigned int) i
			   + (unsigned int) (j * j));
      }

#if 1
  start = __rdtscp (&aux);
  for (i = 0; i < LOOP; i += 16)
    y = interleave_mul (p0[i], p1[i]);
  end = __rdtscp (&aux);
  diff = end - start;
  /* diff is unsigned long long, so the matching conversion is %llu.  */
  printf ("interleave_mul : %llu\n", diff);
#endif

#if 1
  start = __rdtscp (&aux);
  for (i = 0; i < LOOP; i += 16)
    x = extend_mul (p0[i], p1[i]);
  end = __rdtscp (&aux);
  diff = end - start;
  printf ("extend_mul : %llu\n", diff);
#endif

  free (p0);
  free (p1);
  return 0;
}

/* --- show a little bit improvement:
   interleave_mul : 104180000
   extend_mul : 103922083  */