https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401
--- Comment #2 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Another test case (C source, followed by the generated assembly):
/*
 * Reproducer: read src with a 4-field segment load (vl = 4), split the
 * resulting tuple into its four m1 fields, recombine them into a single
 * m4 register group, and store 16 elements of that group to dst.
 *
 * dst: destination of the final vse32 store (16 floats requested).
 * src: source of the vlseg4e32 segment load (4 fields, vl = 4).
 *
 * NOTE(review): going by the name, this is meant to transpose a row-major
 * 4x4 float matrix. That presumably relies on each m1 register holding
 * exactly four floats (VLEN = 128), so that the 16 stored elements are the
 * four fields back to back -- confirm.
 */
void matrix_4x4_transpose_segmented_load(float* dst, float* src)
{
/* Segment load: 4 fields of 32-bit elements, vl = 4; yields a 4-tuple of m1 vectors. */
vfloat32m1x4_t data = __riscv_vlseg4e32_v_f32m1x4(src, 4);
/* Extract fields 0..3 of the tuple as separate m1 values. */
vfloat32m1_t data0 = __riscv_vget_v_f32m1x4_f32m1(data, 0);
vfloat32m1_t data1 = __riscv_vget_v_f32m1x4_f32m1(data, 1);
vfloat32m1_t data2 = __riscv_vget_v_f32m1x4_f32m1(data, 2);
vfloat32m1_t data3 = __riscv_vget_v_f32m1x4_f32m1(data, 3);
/* Combine the four m1 values, in field order, into one m4 register group. */
vfloat32m4_t packedData = __riscv_vcreate_v_f32m1_f32m4(data0,
data1,
data2,
data3);
/* Store from the m4 group to dst with vl = 16. */
__riscv_vse32_v_f32m4(dst, packedData, 16);
}
# Assembly listing for the C function of the same name.
# NOTE(review): a0/a1 are taken to be dst/src (first/second argument under
# the standard RISC-V calling convention) -- consistent with the C source.
matrix_4x4_transpose_segmented_load:
# vl = 4, SEW = 32, LMUL = 1, tail/mask agnostic: configuration for the segment load.
vsetivli zero,4,e32,m1,ta,ma
# 4-field segment load from (a1); the fields land in v8, v9, v10, v11.
vlseg4e32.v v8,(a1)
# vl = 16, SEW = 32, LMUL = 4: configuration for the final store.
vsetivli zero,16,e32,m4,ta,ma
# Whole-register copies moving the four fields from v8-v11 into v4-v7,
# the register group the store reads.
# NOTE(review): these copies look avoidable if the segment load targeted
# v4 directly -- presumably the codegen issue this comment illustrates.
vmv1r.v v4,v8
vmv1r.v v5,v9
vmv1r.v v6,v10
vmv1r.v v7,v11
# Store the m4 group starting at v4 to 0(a0).
vse32.v v4,0(a0)
ret