https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116590
--- Comment #4 from nihui <shuizhuyuanluo at gmail dot com> ---

Some more detailed investigation:

For vfrec7, I can use vfdiv instead, which works. However, the vmv1r/vmv8r instructions are hard to avoid in the code, as they seem to be added automatically by the compiler.

I wrote three test cases: one of which compiles, one of which breaks down with -O0 optimization, and one of which breaks down with -O2/-O3 optimization.

The code that breaks down differs only by inlining the _a variable or changing its scope.

// all pass
void gen_vmv8r(float* ptr, int n)
{
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
        vfloat32m8_t _a = __riscv_vfmv_v_f_f32m8(0.5f, vl);
        _p = __riscv_vfmacc_vf_f32m8(_a, 1.4f, _p, vl);
        __riscv_vse32_v_f32m8(ptr, _p, vl);
        ptr += vl;
        n -= vl;
    }
}

// -O0 gen vmv8r
// -O1 pass
// -O2/-O3 pass
void gen_vmv8r(float* ptr, int n)
{
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
        _p = __riscv_vfmacc_vf_f32m8(__riscv_vfmv_v_f_f32m8(0.5f, vl), 1.4f, _p, vl);
        __riscv_vse32_v_f32m8(ptr, _p, vl);
        ptr += vl;
        n -= vl;
    }
}

// -O0 pass
// -O1 gen vmv8r
// -O2/-O3 gen two vmv8r
void no_vmv8r(float* ptr, int n)
{
    size_t vl0 = __riscv_vsetvl_e32m8(n);
    vfloat32m8_t _a = __riscv_vfmv_v_f_f32m8(0.5f, vl0);
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
        _p = __riscv_vfmacc_vf_f32m8(_a, 1.4f, _p, vl);
        __riscv_vse32_v_f32m8(ptr, _p, vl);
        ptr += vl;
        n -= vl;
    }
}