https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112438

--- Comment #5 from Kito Cheng <kito at gcc dot gnu.org> ---
Assume:

VLEN = 128 and n = 5, *in is {0, 0, 0, 0, 0}
so VLMAX = 4 for e32m1

It can be run with vl = 4 for the first iteration and vl = 1 for the second
iteration.

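But it could also be something like this: vl = 3 for the first iteration and
vl = 2 for the second iteration. (This is legal: when VLMAX < AVL < 2 * VLMAX,
vsetvli may return any vl with ceil(AVL / 2) <= vl <= VLMAX, i.e. 3 or 4 here.)
OK, let's run the code with that: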

foo(int, int*, int*):
        ble     a0,zero,.L5
        csrr    a5,vlenb
        srli    a5,a5,2
        vsetvli a3,zero,e32,m1,ta,ma
        vmv.v.x v4,a5                 # v4 = {4, 4, 4, 4}
        vid.v   v2                    # v2 = {0, 1, 2, 3}
.L3:
        vsetvli a5,a0,e32,m1,ta,ma    # first iteration got vl = 3
        slli    a4,a5,2
        vle32.v v1,0(a1)              # v1 = {0, 0, 0}
        sub     a0,a0,a5
        vadd.vv v1,v1,v2              # v1 = {0, 0, 0} + {0, 1, 2}
        vse32.v v1,0(a2)              # out = {0, 1, 2, 0, 0}
        add     a1,a1,a4
        vsetvli a5,zero,e32,m1,ta,ma
        add     a2,a2,a4
        vadd.vv v2,v2,v4              # v2 = {0, 1, 2, 3} + {4, 4, 4, 4}
                                      #    = {4, 5, 6, 7}
        bne     a0,zero,.L3
.L5:
        ret

OK, let's run the second iteration:

.L3:
        vsetvli a5,a0,e32,m1,ta,ma    # second iteration got vl = 2
        slli    a4,a5,2
        vle32.v v1,0(a1)              # v1 = {0, 0}
        sub     a0,a0,a5
        vadd.vv v1,v1,v2              # v1 = {0, 0} + {4, 5}
        vse32.v v1,0(a2)              # out = {0, 1, 2, 4, 5}
        add     a1,a1,a4
        vsetvli a5,zero,e32,m1,ta,ma
        add     a2,a2,a4
        vadd.vv v2,v2,v4              # v2 = {4, 5, 6, 7} + {4, 4, 4, 4}
                                      #    = {8, 9, 10, 11}
        bne     a0,zero,.L3

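And then you will get {0, 1, 2, 4, 5} rather than the expected {0, 1, 2, 3, 4},
because v2 is advanced by VLMAX (v4 = {4, 4, 4, 4}) on every iteration instead
of by the vl actually processed (3 in the first iteration).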

Reply via email to