https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105513
--- Comment #8 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Alexander Monakov from comment #7)
> The second sequence is 3 uops vs 1/2 (issued/executed) uops in first, and on
> Haswell and Skylake it ties up port 5 for two cycles.
>
> Unclear if you're microbenchmarking latency or throughput, but in any case
> on Haswell and Skylake you should see a close to 2x difference.
I'm counting clock ticks, and thought a load might have higher latency.
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
/* Number of iterations of the timed loop in main.  */
#define LOOP 1000000000
/* GCC vector extension: a 16-byte vector of long (two elements when long
   is 8 bytes).  */
typedef long v2di __attribute__((vector_size(16)));
/* GCC vector extension: a 16-byte vector of int.  Not referenced by the
   code below.  */
typedef int v4si __attribute__((vector_size(16)));
/* Return A with element 1 overwritten by the constant 111113; element 0
   is returned as passed in.  The noipa attribute disables interprocedural
   optimization between this function and its callers (it implies
   noinline and noclone), so the element store stays inside an
   out-of-line function.  */
v2di
__attribute__ ((noipa))
foo (v2di a)
{
a[1] = 111113;
return a;
}
/* Deliberately empty: accepts a v2di by value and does nothing with it.
   main passes every result of foo to this function.  Because of noipa the
   caller is compiled as if this body were not visible; presumably this is
   what keeps the call, and the value fed to it, from being optimized
   away (NOTE(review): intent inferred, confirm against the PR).  */
void
__attribute__ ((noipa))
foo1 (v2di a)
{
}
/* Time LOOP back-to-back calls of foo, each result being handed to foo1,
   by reading the time-stamp counter before and after the loop, then print
   the difference between the two readings.  Always returns 0.  */
int
main (void)
{
  int i;
  unsigned long long start, end;
  unsigned long long diff;
  unsigned int aux;

  /* __rdtscp also writes a value through its pointer argument; that value
     is not used here.  */
  start = __rdtscp (&aux);
  v2di b = __extension__ (v2di){111, 222};
  for (i = 0; i < LOOP; i++)
    {
      v2di a = foo (b);
      foo1 (a);
    }
  end = __rdtscp (&aux);

  diff = end - start;
  /* diff is unsigned long long, so the matching conversion is %llu
     (the original used the signed %lld).  */
  printf ("alterna: %llu\n", diff);
  return 0;
}