https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122219

--- Comment #20 from Pengfei Li <pfustc at gcc dot gnu.org> ---
A vector version of the above case is:

// GCC vector-extension types with float elements:
// v4sf is a 16-byte vector and v8sf is a 32-byte vector.
typedef float v4sf __attribute__((vector_size(16)));
typedef float v8sf __attribute__((vector_size(32)));

// Overlays one 32-byte vector (m) with two 16-byte vectors (n[0], n[1]),
// so the same storage can be accessed as a whole or as two halves.
union TYPE {
  v8sf m;     // the full 32-byte view
  v4sf n[2];  // the same bytes viewed as two 16-byte vectors
};

// Widens a 16-byte vector to a 32-byte one by punning through the TYPE union.
// Only n[0] is written before m is read, so the bytes of the result that
// overlap n[1] are never initialized by this function.
// NOTE(review): reading a union member other than the one last written relies
// on the compiler's type-punning behavior — confirm it is supported wherever
// this test case is built.
inline v8sf convert(v4sf n) {
  TYPE v;
  v.n[0] = n;  // fills only the first 16 bytes; v.n[1] is left untouched
  return v.m;  // reinterpret the whole union as a 32-byte vector
}

// Reduced test case: squares x.n[0] in place once per iteration, n times, and
// returns the last squared value widened to 32 bytes via convert().
//
// x is passed by value, so the caller's union is not modified.
// v is assigned only inside the loop; when n is 0 the function returns it
// uninitialized.
// NOTE(review): i is a signed int compared against the unsigned n, so the
// comparison is carried out in unsigned arithmetic — left as in the original
// report so the reproducer is unchanged.
v8sf foo(TYPE x, unsigned n) {
  v8sf v;
  for (int i = 0; i < n; i++) {
    x.n[0] = x.n[0] * x.n[0];  // element-wise square of the first half
    v = convert(x.n[0]);       // overwritten each iteration; only the last one is returned
  }
  return v;
}

GCC produces:

// Loop body emitted for foo(); the annotations below are editorial and not
// part of the compiler output.
.L3:
        fmul    v31.4s, v31.4s, v31.4s  // multiply v31 by itself, four single-precision lanes
        add     w0, w0, 1               // increment the loop counter in w0
        ldr     q30, [sp, 16]           // 16-byte load from sp+16, repeated every iteration; the report says it need not be in the loop
        cmp     w1, w0                  // compare the counter with w1 (presumably n — confirm)
        bne     .L3                     // repeat until the two are equal

The load doesn't need to be in the loop.

Reply via email to