https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116590

--- Comment #4 from nihui <shuizhuyuanluo at gmail dot com> ---
Some more detailed investigation

For vfrec7 I can use vfdiv instead, which works, but the vmv1r/vmv8r
instructions are hard to avoid in the code, as they seem to be inserted
automatically by the compiler.

I wrote 3 test cases: one passes at every optimization level, one breaks at
-O0, and one breaks with optimization enabled (-O1 and -O2/-O3).

The failing variants differ from the passing one only by inlining the _a
variable or by changing its scope.


// all pass
// Test variant that keeps the 0.5f splat in a named local (_a) declared inside
// the loop body. Walks the n floats at ptr in chunks of vl elements and writes
// the result back to the same location.
// NOTE(review): the name says "gen_vmv8r", yet the report annotates this
// variant as "all pass" -- confirm the intended name.
void gen_vmv8r(float* ptr, int n)
{
    while (n > 0)
    {
        // Element count handled in this iteration (e32 / LMUL=8 setting).
        size_t vl = __riscv_vsetvl_e32m8(n);

        // Load vl floats from ptr.
        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);

        // Splat of 0.5f, held in its own loop-scoped variable in this variant.
        vfloat32m8_t _a = __riscv_vfmv_v_f_f32m8(0.5f, vl);

        // _a is the first operand; presumably this yields 1.4f * _p + _a per
        // the RVV vfmacc definition -- verify against the intrinsics spec.
        _p = __riscv_vfmacc_vf_f32m8(_a, 1.4f, _p, vl);

        // Store the result over the input elements.
        __riscv_vse32_v_f32m8(ptr, _p, vl);

        // Advance past the elements just processed.
        ptr += vl;
        n -= vl;
    }
}

// -O0 gen vmv8r
// -O1 pass
// -O2/-O3 pass
// Test variant with the 0.5f splat written inline as the first argument of the
// vfmacc call, so there is no named _a variable. Walks the n floats at ptr in
// chunks of vl elements and writes the result back to the same location.
void gen_vmv8r(float* ptr, int n)
{
    while (n > 0)
    {
        // Element count handled in this iteration (e32 / LMUL=8 setting).
        size_t vl = __riscv_vsetvl_e32m8(n);

        // Load vl floats from ptr.
        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);

        // The splat is a temporary here; presumably this yields
        // 1.4f * _p + 0.5f per the RVV vfmacc definition -- verify against
        // the intrinsics spec.
        _p = __riscv_vfmacc_vf_f32m8(__riscv_vfmv_v_f_f32m8(0.5f, vl), 1.4f,
_p, vl);

        // Store the result over the input elements.
        __riscv_vse32_v_f32m8(ptr, _p, vl);

        // Advance past the elements just processed.
        ptr += vl;
        n -= vl;
    }
}

// -O0 pass
// -O1 gen vmv8r
// -O2/-O3 gen two vmv8r
// Test variant with the 0.5f splat (_a) created once before the loop and
// reused by every iteration. Walks the n floats at ptr in chunks of vl elements
// and writes the result back to the same location.
// NOTE(review): the name says "no_vmv8r", yet the report annotates this
// variant as generating vmv8r at -O1 and above -- confirm the intended name.
void no_vmv8r(float* ptr, int n)
{
    // The splat is built with vl0, computed from the initial n; the loop below
    // recomputes its own vl each iteration.
    size_t vl0 = __riscv_vsetvl_e32m8(n);
    vfloat32m8_t _a = __riscv_vfmv_v_f_f32m8(0.5f, vl0);

    while (n > 0)
    {
        // Element count handled in this iteration (e32 / LMUL=8 setting).
        size_t vl = __riscv_vsetvl_e32m8(n);

        // Load vl floats from ptr.
        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);

        // The same loop-invariant _a is passed as the first operand on every
        // iteration; presumably this yields 1.4f * _p + _a per the RVV vfmacc
        // definition -- verify against the intrinsics spec.
        _p = __riscv_vfmacc_vf_f32m8(_a, 1.4f, _p, vl);

        // Store the result over the input elements.
        __riscv_vse32_v_f32m8(ptr, _p, vl);

        // Advance past the elements just processed.
        ptr += vl;
        n -= vl;
    }
}

Reply via email to