https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401
--- Comment #3 from JuzheZhong <juzhe.zhong at rivai dot ai> ---

#include <riscv_vector.h>

vfloat32m4_t matrix_4x4_transpose_vslide(vfloat32m4_t src) {
    vfloat32m1_t inMat0 = __riscv_vget_v_f32m4_f32m1(src, 0);
    vfloat32m1_t inMat1 = __riscv_vget_v_f32m4_f32m1(src, 1);
    vfloat32m1_t inMat2 = __riscv_vget_v_f32m4_f32m1(src, 2);
    vfloat32m1_t inMat3 = __riscv_vget_v_f32m4_f32m1(src, 3);

    vuint32m1_t oddMask_u32  = __riscv_vmv_v_x_u32m1(0xaaaa, 1);
    vuint32m1_t evenMask_u32 = __riscv_vmv_v_x_u32m1(0x5555, 1);

    vbool32_t oddMask = __riscv_vreinterpret_v_u32m1_b32(oddMask_u32);
    // vl=4 in the following
    // should be mapped to vslideup.vi
    vfloat32m1_t transMat0 = __riscv_vslideup_vx_f32m1_tumu(oddMask, inMat0, inMat1, 1, 4);
    vfloat32m1_t transMat2 = __riscv_vslideup_vx_f32m1_tumu(oddMask, inMat2, inMat3, 1, 4);

    vbool32_t evenMask = __riscv_vreinterpret_v_u32m1_b32(evenMask_u32);
    // should be mapped to vslidedown.vi
    vfloat32m1_t transMat1 = __riscv_vslidedown_vx_f32m1_tumu(evenMask, inMat1, inMat0, 1, 4);
    vfloat32m1_t transMat3 = __riscv_vslidedown_vx_f32m1_tumu(evenMask, inMat3, inMat2, 1, 4);

    // should be mapped to vslideup.vi
    vfloat32m1_t outMat0 = __riscv_vslideup_vx_f32m1_tu(transMat0, transMat2, 2, 4);
    vfloat32m1_t outMat1 = __riscv_vslideup_vx_f32m1_tu(transMat1, transMat3, 2, 4);

    // vl=2 in the following
    // should be mapped to vslidedown.vi
    vfloat32m1_t outMat2 = __riscv_vslidedown_vx_f32m1_tu(transMat2, transMat0, 2, 2);
    vfloat32m1_t outMat3 = __riscv_vslidedown_vx_f32m1_tu(transMat3, transMat1, 2, 2);

    return __riscv_vcreate_v_f32m1_f32m4(outMat0, outMat1, outMat2, outMat3);
}

matrix_4x4_transpose_vslide:
        li              a4,45056
        addiw           a4,a4,-1366
        vsetivli        zero,1,e32,m1,ta,ma
        li              a5,20480
        vmv.v.x         v0,a4
        vsetivli        zero,4,e32,m1,tu,mu
        vl4re32.v       v4,0(a1)
        addiw           a5,a5,1365
        vmv1r.v         v12,v4
        vmv1r.v         v3,v6
        vslideup.vi     v12,v5,1,v0.t
        vslideup.vi     v3,v7,1,v0.t
        vsetivli        zero,1,e32,m1,ta,ma
        vmv1r.v         v1,v12
        vmv.v.x         v0,a5
        vsetivli        zero,4,e32,m1,tu,mu
        vslideup.vi     v1,v3,2
        vmv1r.v         v2,v5
        vmv1r.v         v8,v1
        vslidedown.vi   v2,v4,1,v0.t
        vmv1r.v         v1,v7
        vmv1r.v         v4,v2
        vslidedown.vi   v1,v6,1,v0.t
        vslideup.vi     v4,v1,2
        vsetivli        zero,2,e32,m1,tu,ma
        vmv1r.v         v9,v4
        vslidedown.vi   v3,v12,2
        vslidedown.vi   v1,v2,2
        vmv1r.v         v10,v3
        vmv1r.v         v11,v1
        vs4r.v          v8,0(a0)
        ret
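For readers tracing the intrinsic sequence: the mask constants select lanes
(0xaaaa activates odd lanes 1 and 3, 0x5555 activates even lanes 0 and 2),
so the masked tumu slides interleave pairs of rows, and the unmasked tu
slides then merge the halves. If I traced the element movements correctly,
the whole function is just a row-major 4x4 transpose. A minimal scalar
reference, my sketch and not part of the testcase, for checking the vector
version against:

    #include <stdio.h>

    /* Scalar reference: out[c][r] = in[r][c] for a row-major 4x4 matrix,
       which is what the vslide-based function above should compute. */
    static void matrix_4x4_transpose_ref(float out[16], const float in[16]) {
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 4; c++)
                out[c * 4 + r] = in[r * 4 + c];
    }

    int main(void) {
        float in[16], out[16];
        for (int i = 0; i < 16; i++)
            in[i] = (float)i;               /* rows: 0..3, 4..7, 8..11, 12..15 */
        matrix_4x4_transpose_ref(out, in);
        for (int i = 0; i < 16; i++)        /* expect columns: 0 4 8 12, ... */
            printf("%4.0f%s", out[i], (i % 4 == 3) ? "\n" : " ");
        return 0;
    }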