https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122219
--- Comment #20 from Pengfei Li <pfustc at gcc dot gnu.org> ---
A vector version of the above case is:
// GCC vector-extension types; vector_size gives the total size in bytes.
// 16-byte vector of float (four lanes, assuming a 4-byte float).
typedef float v4sf __attribute__((vector_size(16)));
// 32-byte vector of float (eight lanes, assuming a 4-byte float).
typedef float v8sf __attribute__((vector_size(32)));
// Overlays one v8sf with an array of two v4sf in the same storage, so the
// wide vector can also be accessed as two half-width vectors.
union TYPE {
// The storage viewed as a single wide vector.
v8sf m;
// The same storage viewed as two half-width vectors.
v4sf n[2];
};
// Builds a v8sf from a v4sf by storing `n` into the first half of a local
// TYPE and reading the union back through its full-width member.
// Only v.n[0] is assigned; v.n[1] is never written, so the second half of
// the returned vector is whatever the uninitialized local happened to hold.
// NOTE(review): reading `m` after writing `n[0]` relies on union type
// punning, which GCC supports as an extension — confirm for other compilers.
inline v8sf convert(v4sf n) {
TYPE v;
v.n[0] = n;
return v.m;
}
// Reproducer loop: runs `n` iterations, each of which multiplies x.n[0] by
// itself element-wise (storing the result back into x.n[0]) and then rebuilds
// `v` from the updated value via convert().
// Returns the `v` produced by the last iteration.
// NOTE(review): `v` has no initializer, so it is returned unset when n == 0,
// and `i < n` compares a signed int with an unsigned. Both are left as-is
// because this snippet is a minimal compiler test case.
v8sf foo(TYPE x, unsigned n) {
v8sf v;
for (int i = 0; i < n; i++) {
// Square the first half in place.
x.n[0] = x.n[0] * x.n[0];
// Widen through the union; only the first half of the result is defined.
v = convert(x.n[0]);
}
return v;
}
GCC produces:
.L3:
fmul v31.4s, v31.4s, v31.4s
add w0, w0, 1
ldr q30, [sp, 16]
cmp w1, w0
bne .L3
The load (`ldr q30, [sp, 16]`) doesn't need to be in the loop.