https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71414
--- Comment #4 from Yichao Yu <yyc1992 at gmail dot com> ---
The C code is in the linked gist. `a` is a cacheline-aligned pointer and `n` is
1024, so `a` should even fit in L1d, which is 32kB on both processors I
benchmarked.
More precise timing (ns per loop)
6700K
```
% ./benchmark-gcc
80.553456
% ./benchmark-clang37
28.222281
% ./benchmark-clang38
41.782532
```
4702HQ
```
% ./benchmark-gcc
140.744893
% ./benchmark-clang37
50.835441
% ./benchmark-clang38
70.220946
```
Pasting the whole program over for completeness.
The commented-out alignment line (`__builtin_assume_aligned`) gives some weird
timing on clang without `-mcore-avx2`, but doesn't change much with
`-Ofast -mcore-avx2`.
```
//
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
/*
 * Read CLOCK_MONOTONIC and return the reading as a single count of
 * nanoseconds (seconds scaled to nanoseconds plus the nanosecond part).
 * The return status of clock_gettime is not inspected.
 */
uint64_t gettime_ns()
{
    const uint64_t ns_per_sec = UINT64_C(1000000000);
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);

    /* Both terms are converted to unsigned 64-bit before being combined. */
    uint64_t sec_part = (uint64_t) now.tv_sec * ns_per_sec;
    uint64_t nsec_part = (uint64_t) now.tv_nsec;
    return sec_part + nsec_part;
}
/*
 * Benchmark kernel: add up the n floats starting at a, in index order,
 * and return the total as a float.
 *
 * The noinline attribute stops the compiler from inlining this function
 * into its caller -- presumably so the timing loop in main measures a
 * real call to this loop rather than something folded into main.
 */
__attribute__((noinline)) float sum32(float *a, size_t n)
{
/* Disabled alignment hint: if enabled, it tells the compiler that a is
 * 64-byte aligned. Left commented out so both variants can be timed. */
/* a = (float*)__builtin_assume_aligned(a, 64); */
float s = 0;
for (size_t i = 0;i < n;i++)
s += a[i];
/* Empty volatile asm with a "memory" clobber: emits no instructions but
 * acts as a compiler-level memory barrier. NOTE(review): presumably here
 * to keep the compiler from discarding the work, since the caller ignores
 * the returned sum -- confirm. */
__asm__ volatile ("" ::: "memory");
return s;
}
/*
 * Benchmark driver.
 *
 * Allocates a 64-byte-aligned, zero-filled buffer of 1024 floats, calls
 * sum32 on it 1024 * 1024 times, and prints the average time per call in
 * nanoseconds.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
    enum { N_ELEMS = 1024, N_ITERS = 1024 * 1024 };

    /* 4096 bytes: a multiple of the 64-byte alignment, as aligned_alloc
     * requires. */
    float *p = aligned_alloc(64, sizeof *p * N_ELEMS);
    if (p == NULL) {
        /* Passing NULL to memset would be undefined behaviour. */
        fprintf(stderr, "aligned_alloc failed\n");
        return EXIT_FAILURE;
    }
    memset(p, 0, sizeof *p * N_ELEMS);

    uint64_t start = gettime_ns();
    for (int i = 0; i < N_ITERS; i++) {
        /* Result intentionally ignored; only the elapsed time matters. */
        sum32(p, N_ELEMS);
    }
    /* Stop the clock before releasing the buffer so that the cost of
     * free() is not counted in the measured interval. */
    uint64_t end = gettime_ns();

    free(p);

    printf("%f\n", (end - start) / (double) N_ITERS);
    return 0;
}
```