https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109812
--- Comment #10 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
This is a benchmarkable version of the simplified testcase:
jan@localhost:/tmp> cat t.c
/* Number of entries in rgbs[] that sum() walks over. */
#define N 10000000

/* Input pixels, one unsigned byte per channel. Static storage, so zeroed. */
struct rgb {
    unsigned char r, g, b;
} rgbs[N];

/* Not referenced anywhere in this translation unit. */
int *addr;

/* Per-channel accumulator; building with -DOPACITY adds a fourth field. */
struct drgb {
    double r, g, b;
#ifdef OPACITY
    double o;
#endif
};

/*
 * Return the per-channel sum of all N entries of rgbs[], each scaled by w.
 *
 * The accumulator is zero-initialized: reading an uninitialized automatic
 * object is undefined behaviour, and the loop below reads every channel
 * before writing it.  With OPACITY defined, the 'o' field is never
 * accumulated into and is therefore returned as 0.
 */
struct drgb sum(double w)
{
    struct drgb acc = {0};

    for (int i = 0; i < N; i++) {
        acc.r += rgbs[i].r * w;
        acc.g += rgbs[i].g * w;
        acc.b += rgbs[i].b * w;
    }
    return acc;
}
jan@localhost:/tmp> cat q.c
/*
 * Result type of sum().  The OPACITY switch must be set the same way for
 * every file that declares this struct so the layouts agree.
 */
struct drgb {
    double r;
    double g;
    double b;
#ifdef OPACITY
    double o;
#endif
};

/* Defined in another translation unit. */
struct drgb sum(double w);

/*
 * Benchmark driver: call sum() 1000 times, passing the pass number
 * (0..999) as the weight.  The returned values are discarded.
 */
int
main()
{
    int pass = 0;

    while (pass < 1000) {
        sum(pass);
        pass++;
    }
}
jan@localhost:/tmp> gcc t.c q.c -march=native -O3 -g ; objdump -d a.out | grep
vfmadd231pd ; perf stat ./a.out
40119d: c4 e2 d9 b8 d1 vfmadd231pd %xmm1,%xmm4,%xmm2
Performance counter stats for './a.out':
12,148.04 msec task-clock:u # 1.000 CPUs
utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
736 page-faults:u # 60.586 /sec
50,018,421,148 cycles:u # 4.117 GHz
220,502 stalled-cycles-frontend:u # 0.00% frontend
cycles idle
39,950,154,369 stalled-cycles-backend:u # 79.87% backend
cycles idle
120,000,191,713 instructions:u # 2.40 insn per
cycle
# 0.33 stalled cycles per
insn
10,000,048,918 branches:u # 823.182 M/sec
7,959 branch-misses:u # 0.00% of all
branches
12.149466078 seconds time elapsed
12.149084000 seconds user
0.000000000 seconds sys
jan@localhost:/tmp> gcc t.c q.c -march=native -O3 -g -DOPACITY ; objdump -d
a.out | grep vfmadd231pd ; perf stat ./a.out
Performance counter stats for './a.out':
12,141.11 msec task-clock:u # 1.000 CPUs
utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
735 page-faults:u # 60.538 /sec
50,018,839,129 cycles:u # 4.120 GHz
185,034 stalled-cycles-frontend:u # 0.00% frontend
cycles idle
29,963,999,798 stalled-cycles-backend:u # 59.91% backend
cycles idle
120,000,191,729 instructions:u # 2.40 insn per
cycle
# 0.25 stalled cycles per
insn
10,000,048,913 branches:u # 823.652 M/sec
7,311 branch-misses:u # 0.00% of all
branches
12.142252354 seconds time elapsed
12.138237000 seconds user
0.004000000 seconds sys
So on Zen 2 hardware I get the same performance on both. It may be interesting to
test it on Raptor Lake.