https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117031

--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
For example

/* Vectorizer testcase: for each i in [0, n), add up the four consecutive
   unsigned short elements x[4*i] .. x[4*i+3] and store the sum, converted
   to double, in y[i].  Reads 4*n elements of x and writes n elements of y.  */
void
test3 (unsigned short *x, double *y, int n)
{
    for (int i = 0; i < n; i+=1)
        {
            /* Load the four adjacent elements that make up group i.  */
            unsigned short a1 = x[i * 4 + 0];
            unsigned short b1 = x[i * 4 + 1];
            unsigned short c1 = x[i * 4 + 2];
            unsigned short d1 = x[i * 4 + 3];
            /* The operands undergo integer promotion, so the additions are
               done in int (assuming int is wider than unsigned short) and
               the sum is not truncated to 16 bits before the conversion
               to double.  */
            y[i+0] = (double)(a1 + b1 + c1 + d1);
        }
}

ends up as

.L4:
        ld4     {v28.8h - v31.8h}, [x4], 64
        add     x3, x3, 64
        uaddl2  v26.4s, v28.8h, v29.8h
        uaddl   v28.4s, v28.4h, v29.4h
        uaddw2  v0.4s, v26.4s, v30.8h
        uaddw   v28.4s, v28.4s, v30.4h
        uaddw2  v0.4s, v0.4s, v31.8h
        uaddw   v28.4s, v28.4s, v31.4h
        sxtl    v1.2d, v0.2s
        sxtl    v27.2d, v28.2s
        sxtl2   v0.2d, v0.4s
        sxtl2   v28.2d, v28.4s
        scvtf   v1.2d, v1.2d
        scvtf   v27.2d, v27.2d
        scvtf   v0.2d, v0.2d
        scvtf   v28.2d, v28.2d
        stp     q1, q0, [x3, -32]
        stp     q27, q28, [x3, -64]
        cmp     x5, x4
        bne     .L4

Here we can now use widening additions (uaddl/uaddw) and avoid the
HI -> DF conversion penalty.  Note that with ld4 no permutes are needed at all.

Reply via email to