https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110559

            Bug ID: 110559
           Summary: Bad mask_load/mask_store codegen of RVV
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

#include <stdint-gcc.h>

#define INDEX8 uint8_t
#define INDEX16 uint16_t
#define INDEX32 uint32_t
#define INDEX64 uint64_t

#define TEST_LOOP(DATA_TYPE, BITS)                                          \
  void __attribute__ ((noinline, noclone))                                  \
  f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src,         \
                 INDEX##BITS *restrict indices, INDEX##BITS *restrict cond) \
  {                                                                         \
    for (int i = 0; i < 128; ++i)                                           \
      if (cond[i])                                                          \
        dest[i] += src[i];                                                  \
  }

#define TEST_ALL(T)     \
  T (int8_t, 8)         \
  T (uint8_t, 8)        \
  T (int16_t, 16)       \
  T (uint16_t, 16)      \
  T (_Float16, 16)      \
  T (int32_t, 32)       \
  T (uint32_t, 32)      \
  T (float, 32)         \
  T (int64_t, 64)       \
  T (uint64_t, 64)      \
  T (double, 64)

TEST_ALL (TEST_LOOP)
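
For reference, TEST_LOOP (int8_t, 8) expands to roughly the following
function (the indices parameter is unused; the loop body is just a
conditional dest[i] += src[i], so each vector chunk only needs one mask
computed from cond plus masked loads/stores of dest and src):

  /* Approximate expansion of TEST_LOOP (int8_t, 8).  */
  void __attribute__ ((noinline, noclone))
  f_int8_t (int8_t *restrict dest, int8_t *restrict src,
            uint8_t *restrict indices, uint8_t *restrict cond)
  {
    for (int i = 0; i < 128; ++i)
      if (cond[i])
        dest[i] += src[i];
  }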

riscv32: --param riscv-autovec-preference=fixed-vlmax -O3:
f_int8_t:
        addi    sp,sp,-48
        sw      s0,44(sp)
        sw      s1,40(sp)
        sw      s2,36(sp)
        sw      s3,32(sp)
        sw      s4,28(sp)
        sw      s5,24(sp)
        sw      s6,20(sp)
        sw      s7,16(sp)
        sw      s8,12(sp)
        sw      s9,8(sp)
        vsetivli        zero,16,e8,m1,ta,ma
        addi    s9,a3,16
        vmv.v.i v1,0
        vl1re8.v        v8,0(a3)
        vmsne.vv        v8,v8,v1
        vmv1r.v v0,v8
        vl1re8.v        v7,0(s9)
        vle8.v  v9,0(a0),v0.t
        vmsne.vv        v7,v7,v1
        vle8.v  v15,0(a1),v0.t
        addi    s8,a1,16
        vmv1r.v v0,v7
        addi    s7,a3,32
        vle8.v  v14,0(s8),v0.t
        vl1re8.v        v6,0(s7)
        addi    s5,a1,32
        vmsne.vv        v6,v6,v1
        addi    s6,a3,48
        vmv1r.v v0,v6
        vl1re8.v        v5,0(s6)
        vle8.v  v13,0(s5),v0.t
        vmsne.vv        v5,v5,v1
        addi    s4,a1,48
        vmv1r.v v0,v5
        addi    s3,a3,64
        vle8.v  v12,0(s4),v0.t
        vl1re8.v        v4,0(s3)
        addi    s1,a1,64
        vmsne.vv        v4,v4,v1
        addi    s2,a3,80
        vmv1r.v v0,v4
        addi    s0,a1,80
        addi    t2,a3,96
        vle8.v  v11,0(s1),v0.t
        vl1re8.v        v2,0(s2)
        vl1re8.v        v3,0(t2)
        vmsne.vv        v2,v2,v1
        vmsne.vv        v3,v3,v1
        vmv1r.v v0,v2
        vadd.vv v9,v9,v15
        vle8.v  v10,0(s0),v0.t
        vsetvli a5,zero,e8,m1,ta,ma
        vmv1r.v v0,v8
        addi    t4,a0,16
        vse8.v  v9,0(a0),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v7
        vle8.v  v8,0(t4),v0.t
        vadd.vv v8,v8,v14
        vsetvli a5,zero,e8,m1,ta,ma
        addi    t3,a0,32
        vse8.v  v8,0(t4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v6
        vle8.v  v7,0(t3),v0.t
        vadd.vv v7,v7,v13
        vsetvli a5,zero,e8,m1,ta,ma
        addi    t1,a0,48
        vse8.v  v7,0(t3),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v5
        vle8.v  v6,0(t1),v0.t
        vadd.vv v6,v6,v12
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a7,a0,64
        vse8.v  v6,0(t1),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v4
        vle8.v  v5,0(a7),v0.t
        vadd.vv v5,v5,v11
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a6,a0,80
        vse8.v  v5,0(a7),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v2
        vle8.v  v4,0(a6),v0.t
        vadd.vv v4,v4,v10
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a2,a0,96
        vse8.v  v4,0(a6),v0.t
        addi    t0,a1,96
        vsetivli        zero,16,e8,m1,ta,ma
        addi    t6,a3,112
        vmv1r.v v0,v3
        vl1re8.v        v2,0(t6)
        vle8.v  v4,0(a2),v0.t
        vle8.v  v5,0(t0),v0.t
        vmsne.vv        v1,v2,v1
        vadd.vv v4,v4,v5
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a4,a0,112
        vse8.v  v4,0(a2),v0.t
        addi    t5,a1,112
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(t5),v0.t
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        lw      s0,44(sp)
        lw      s1,40(sp)
        lw      s2,36(sp)
        lw      s3,32(sp)
        lw      s4,28(sp)
        lw      s5,24(sp)
        lw      s6,20(sp)
        lw      s7,16(sp)
        lw      s8,12(sp)
        lw      s9,8(sp)
        addi    sp,sp,48
        jr      ra

This codegen is very bad: too many "lw"/"sw" spill/reload instructions and "vmv1r.v" mask copies.
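
For context, with fixed-vlmax and 16 e8 elements per m1 register (as the
"vsetivli zero,16,e8,m1" shows), the 128-iteration loop is fully unrolled
into 8 chunks, and each chunk only needs one live mask plus two data
vectors.  A plain-C sketch of that per-chunk shape, assuming VLEN=128:

  /* Per-chunk shape of the vectorized loop: mask = (cond != 0) via
     vmsne.vv, two masked vle8.v, one vadd.vv, one masked vse8.v.  */
  for (int chunk = 0; chunk < 128; chunk += 16)
    for (int lane = 0; lane < 16; ++lane)        /* one vector op per chunk */
      if (cond[chunk + lane] != 0)               /* mask lives in v0 */
        dest[chunk + lane] += src[chunk + lane]; /* masked load/add/store */

In the listing above, the scheduler hoists the cond loads, vmsne.vv
compares and address computations of all eight chunks ahead of their
uses, which stretches the live ranges and forces the callee-saved
"sw"/"lw" spills and the "vmv1r.v" copies into v0.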

But with -fno-schedule-insns, codegen becomes much more reasonable:

f_uint8_t:
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a2,a3,16
        vmv.v.i v1,0
        vl1re8.v        v0,0(a3)
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a0),v0.t
        vle8.v  v3,0(a1),v0.t
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a4,a0,16
        vse8.v  v2,0(a0),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vl1re8.v        v0,0(a2)
        addi    a2,a1,16
        vmsne.vv        v0,v0,v1
        vle8.v  v3,0(a4),v0.t
        vle8.v  v2,0(a2),v0.t
        addi    a2,a3,32
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,32
        vl1re8.v        v0,0(a2)
        addi    a2,a1,32
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        addi    a2,a3,48
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,48
        vl1re8.v        v0,0(a2)
        addi    a2,a1,48
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        addi    a2,a3,64
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,64
        vl1re8.v        v0,0(a2)
        addi    a2,a1,64
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        addi    a2,a3,80
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,80
        vl1re8.v        v0,0(a2)
        addi    a2,a1,80
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        addi    a2,a3,96
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,96
        vl1re8.v        v0,0(a2)
        addi    a2,a1,96
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a0,a0,112
        vse8.v  v2,0(a4),v0.t
        addi    a3,a3,112
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a1,a1,112
        vl1re8.v        v0,0(a3)
        vmsne.vv        v0,v0,v1
        vle8.v  v1,0(a0),v0.t
        vle8.v  v2,0(a1),v0.t
        vadd.vv v1,v1,v2
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v1,0(a0),v0.t
        ret
