https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401

--- Comment #3 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
// Transposes a 4x4 float32 matrix held in an LMUL=4 register group, using
// only vector slide intrinsics (no memory traffic, no gathers).
//
// src:     the four m1 parts (indices 0..3) are taken as the four input rows.
// returns: an m4 group whose four m1 parts are the four output rows.
//
// Every slide below runs with vl = 4 or vl = 2 and uses a tail-undisturbed
// (_tu / _tumu) variant, so only elements 0..3 of each part are ever changed.
// In the lane diagrams, the input rows are written a, b, c, d
// (inMat0..inMat3) and a0..a3 are the elements of row a.
vfloat32m4_t matrix_4x4_transpose_vslide(vfloat32m4_t src) {
    // Split the m4 group into its four m1 rows.
    vfloat32m1_t inMat0 = __riscv_vget_v_f32m4_f32m1(src, 0);
    vfloat32m1_t inMat1 = __riscv_vget_v_f32m4_f32m1(src, 1);
    vfloat32m1_t inMat2 = __riscv_vget_v_f32m4_f32m1(src, 2);
    vfloat32m1_t inMat3 = __riscv_vget_v_f32m4_f32m1(src, 3);
// Mask constants are splatted with vl=1, i.e. only element 0 of the u32
// vector is written. Reinterpreted as a mask, the low four bits select
// elements: 0xaaaa -> 0b1010 (elements 1 and 3), 0x5555 -> 0b0101 (0 and 2).
vuint32m1_t oddMask_u32 = __riscv_vmv_v_x_u32m1(0xaaaa, 1);
vuint32m1_t evenMask_u32 = __riscv_vmv_v_x_u32m1(0x5555, 1);

vbool32_t oddMask = __riscv_vreinterpret_v_u32m1_b32(oddMask_u32);
// vl=4 in the following
// should be mapped to vslideup.vi
// Masked slide-up by 1: the odd elements of the destination row take the
// even elements of the other row; even elements stay (mask undisturbed).
//   transMat0 = [a0 b0 a2 b2]
vfloat32m1_t transMat0 = __riscv_vslideup_vx_f32m1_tumu(oddMask,
                                                        inMat0,
                                                        inMat1,
                                                        1, 4);
//   transMat2 = [c0 d0 c2 d2]
vfloat32m1_t transMat2 = __riscv_vslideup_vx_f32m1_tumu(oddMask,
                                                        inMat2,
                                                        inMat3,
                                                         1, 4);

vbool32_t evenMask = __riscv_vreinterpret_v_u32m1_b32(evenMask_u32);
// should be mapped to vslidedown.vi
// Masked slide-down by 1: the even elements of the destination row take the
// odd elements of the other row; odd elements stay.
//   transMat1 = [a1 b1 a3 b3]
vfloat32m1_t transMat1 = __riscv_vslidedown_vx_f32m1_tumu(evenMask,
                                                          inMat1,
                                                          inMat0,
                                                          1, 4);
//   transMat3 = [c1 d1 c3 d3]
vfloat32m1_t transMat3 = __riscv_vslidedown_vx_f32m1_tumu(evenMask,
                                                          inMat3,
                                                          inMat2,
                                                          1, 4);

// should be mapped to vslideup.vi
// Unmasked slide-up by 2: elements 2..3 of the destination are replaced by
// elements 0..1 of the source; elements 0..1 are below the offset and stay.
//   outMat0 = [a0 b0 c0 d0]
vfloat32m1_t outMat0 = __riscv_vslideup_vx_f32m1_tu(transMat0,
                                                    transMat2,
                                                    2, 4);
//   outMat1 = [a1 b1 c1 d1]
vfloat32m1_t outMat1 = __riscv_vslideup_vx_f32m1_tu(transMat1,
                                                    transMat3,
                                                    2, 4);

// vl=2 in the following
// should be mapped to vslidedown.vi
// Slide-down by 2 with vl=2: elements 0..1 of the destination are replaced
// by elements 2..3 of the source; elements 2..3 are tail and stay.
//   outMat2 = [a2 b2 c2 d2]
vfloat32m1_t outMat2 = __riscv_vslidedown_vx_f32m1_tu(transMat2,
                                                      transMat0,
                                                      2, 2);
//   outMat3 = [a3 b3 c3 d3]
vfloat32m1_t outMat3 = __riscv_vslidedown_vx_f32m1_tu(transMat3,
                                                      transMat1,
                                                      2, 2);

// Reassemble the four transposed rows into one m4 group.
return __riscv_vcreate_v_f32m1_f32m4(outMat0,
                                     outMat1,
                                     outMat2,
                                     outMat3);




}


# Assembly listing for matrix_4x4_transpose_vslide (presumably the compiler
# output for the C function above -- the surrounding text does not say so).
# The four input registers are loaded from the address in a1 and the four
# result registers are stored to the address in a0.
#
# Register roles, matched against the C source by operand order:
#   v4..v7   inMat0..inMat3
#   v12      transMat0          v3   transMat2, later overwritten -> outMat2
#   v2       transMat1          v1   scratch: outMat0, transMat3, outMat3
#   v8..v11  outMat0..outMat3 (the group that is stored)
#   v0       slide mask: 0xaaaa first, then 0x5555
matrix_4x4_transpose_vslide:
        li      a4,45056
        addiw   a4,a4,-1366                 # a4 = 45056 - 1366 = 43690 = 0xaaaa
        vsetivli        zero,1,e32,m1,ta,ma # vl=1, SEW=32 for the mask splat
        li      a5,20480
        vmv.v.x v0,a4                       # v0 = 0xaaaa (oddMask)
        vsetivli        zero,4,e32,m1,tu,mu # vl=4, tail+mask undisturbed
        vl4re32.v       v4,0(a1)            # v4..v7 = input rows from (a1)
        addiw   a5,a5,1365                  # a5 = 20480 + 1365 = 21845 = 0x5555
        vmv1r.v v12,v4                      # v12 = copy of inMat0 (slide dest)
        vmv1r.v v3,v6                       # v3  = copy of inMat2 (slide dest)
        vslideup.vi     v12,v5,1,v0.t       # v12 = transMat0
        vslideup.vi     v3,v7,1,v0.t        # v3  = transMat2
        vsetivli        zero,1,e32,m1,ta,ma # vl=1 again for the second mask
        vmv1r.v v1,v12                      # v1 = copy of transMat0 (slide dest)
        vmv.v.x v0,a5                       # v0 = 0x5555 (evenMask)
        vsetivli        zero,4,e32,m1,tu,mu # back to vl=4, tu/mu
        vslideup.vi     v1,v3,2             # v1 = outMat0
        vmv1r.v v2,v5                       # v2 = copy of inMat1 (slide dest)
        vmv1r.v v8,v1                       # v8 = outMat0 (result part 0)
        vslidedown.vi   v2,v4,1,v0.t        # v2 = transMat1
        vmv1r.v v1,v7                       # v1 = copy of inMat3 (slide dest)
        vmv1r.v v4,v2                       # v4 = copy of transMat1 (slide dest)
        vslidedown.vi   v1,v6,1,v0.t        # v1 = transMat3
        vslideup.vi     v4,v1,2             # v4 = outMat1
        vsetivli        zero,2,e32,m1,tu,ma # vl=2 for the final slide-downs
        vmv1r.v v9,v4                       # v9 = outMat1 (result part 1)
        vslidedown.vi   v3,v12,2            # v3 = outMat2 (transMat2 updated in place)
        vslidedown.vi   v1,v2,2             # v1 = outMat3 (transMat3 updated in place)
        vmv1r.v v10,v3                      # v10 = outMat2 (result part 2)
        vmv1r.v v11,v1                      # v11 = outMat3 (result part 3)
        vs4r.v  v8,0(a0)                    # store v8..v11 to (a0)
        ret

Reply via email to