https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88760
--- Comment #22 from ktkachov at gcc dot gnu.org ---
Some more experiments...
Unrolling 4x in a similar way to my previous example, without splitting the
accumulator (that's a separate issue):
unsigned int *colnums;
double *val;

struct foostruct
{
  unsigned int rows;
  unsigned int *colnums;
  unsigned int *rowstart;
};

struct foostruct *cols;

void
foo (double * __restrict__ dst, const double *__restrict__ src)
{
  const unsigned int n_rows = cols->rows;
  const double *val_ptr = &val[cols->rowstart[0]];
  const unsigned int *colnum_ptr = &cols->colnums[cols->rowstart[0]];
  double *dst_ptr = dst;
  for (unsigned int row = 0; row < n_rows; ++row)
    {
      double s = 0.;
      const double *const val_end_of_row = &val[cols->rowstart[row + 1]];
      __PTRDIFF_TYPE__ diff = val_end_of_row - val_ptr;
      /* Peel one iteration if the row length is odd...  */
      if (diff & 1)
        {
          s += *val_ptr++ * src[*colnum_ptr++];
          diff--;
        }
      /* ... and two more if needed, so the remainder is a multiple of 4.  */
      if (diff & 2)
        {
          s += val_ptr[0] * src[colnum_ptr[0]];
          s += val_ptr[1] * src[colnum_ptr[1]];
          val_ptr += 2;
          colnum_ptr += 2;
        }
      /* Main 4x-unrolled body.  */
      while (val_ptr != val_end_of_row)
        {
          s += val_ptr[0] * src[colnum_ptr[0]];
          s += val_ptr[1] * src[colnum_ptr[1]];
          s += val_ptr[2] * src[colnum_ptr[2]];
          s += val_ptr[3] * src[colnum_ptr[3]];
          val_ptr += 4;
          colnum_ptr += 4;
        }
      *dst_ptr++ = s;
    }
}
helps even more. On Cortex-A72 it gives a bit more than a 6% improvement on
parest (vs 3% for the previous example), and about 5.3% on a more aggressive
CPU.
I tried unrolling 8x in a similar manner and that was not faster than 4x on
either target.
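For reference, splitting the accumulator (the separate issue mentioned above)
would replace the 4x while loop with something like the sketch below; two
independent partial sums break the serial fmadd dependency chain, at the cost
of changing the FP summation order. This is just an untested illustration of
the idea, not something I've measured:

      /* Hypothetical variant: two independent accumulators so the four
         multiply-adds don't form one serial fmadd chain.  Note this
         reassociates the FP additions, so it is only valid as a manual
         source change (or under -ffast-math-style reassociation).  */
      double s0 = 0., s1 = 0.;
      while (val_ptr != val_end_of_row)
        {
          s0 += val_ptr[0] * src[colnum_ptr[0]];
          s1 += val_ptr[1] * src[colnum_ptr[1]];
          s0 += val_ptr[2] * src[colnum_ptr[2]];
          s1 += val_ptr[3] * src[colnum_ptr[3]];
          val_ptr += 4;
          colnum_ptr += 4;
        }
      *dst_ptr++ = s + s0 + s1;

(s here is the partial sum from the peeled iterations, as in the code above.)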
Note that perf profiling shows that the loads are what's hot in these loops,
not the FMAs themselves. The hottest instructions are the indexed loads from
src (the ldr d*, [x1, x*, lsl #3] forms below, i.e. the data-dependent
src[*colnum_ptr] accesses):
 4.41 │1b8:   ldp   w3, w4, [x0]
 5.85 │       ldp   d3, d4, [x2]
      │       add   x2, x2, #0x20
 3.79 │       ldur  d5, [x2, #-16]
 2.82 │       ldr   d0, [x1, x4, lsl #3]
 2.53 │       ldr   d2, [x1, x3, lsl #3]
 2.10 │       ldp   w4, w3, [x0, #8]
      │       add   x0, x0, #0x10
 0.00 │       cmp   x5, x0
      │       fmul  d0, d0, d4
 4.73 │       ldr   d4, [x1, x4, lsl #3]
      │       fmadd d0, d3, d2, d0
 2.01 │       ldur  d3, [x2, #-8]
 2.54 │       ldr   d2, [x1, x3, lsl #3]
      │       fmadd d0, d5, d4, d0
      │       fmadd d0, d3, d2, d0
      │       fadd  d1, d1, d0