https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122901

            Bug ID: 122901
           Summary: gcc is not able to handle pair of vectors coming from
                    __builtin_convertvector
           Product: gcc
           Version: 15.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: [email protected]
  Target Milestone: ---

The function below takes an array of int16_t values and multiplies each of them
by some factor from the range [0, 1]. It is optimized for integer arithmetic, so
it takes the factor as a value in the range [0, 1024], multiplies by it and
divides by 1024. To avoid integer overflow, it has to extend all values to
int32 before the multiplication and narrow the result back to int16 at the end.
It uses __builtin_convertvector for both conversions:

[code]
#include <stdint.h>

void f(int16_t *buf, unsigned int size, uint32_t k)
{
    typedef int16_t v16x8 __attribute__ ((vector_size (16)));
    typedef int32_t v32x8 __attribute__ ((vector_size (32)));

    if (k > 1024)
        __builtin_unreachable();

    for (unsigned int n = 0; n < size; n += 8) {
        v16x8 v1 = *(v16x8*)(buf + n);                 /* load 8 x int16 */
        v32x8 v2 = __builtin_convertvector(v1, v32x8); /* widen to 8 x int32 */
        v2 *= int(k);
        v2 /= 1024;
        v1 = __builtin_convertvector(v2, v16x8);       /* narrow back to int16 */
        *(v16x8*)(buf + n) = v1;
    }
}
[/code]
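For reference, each lane is computed with C++'s truncating signed division,
which is what the "add 1023 for negative values, then arithmetic shift right by
10" sequences in the assembly below implement. A scalar sketch of one lane (the
helper name is mine, not part of the reproducer):

[code]
#include <stdint.h>

/* Scalar equivalent of one lane. Signed division by 1024 truncates toward
   zero, so for negative x the compiler must compute (x + 1023) >> 10
   instead of a plain arithmetic shift. */
static inline int16_t scale_one(int16_t v, uint32_t k) /* k in [0, 1024] */
{
    int32_t x = int32_t(v) * int32_t(k); /* widen first to avoid overflow */
    return int16_t(x / 1024);            /* truncating division, then narrow */
}
[/code]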

clang 21.1.0 for ARM64 with -O3 gives nice, fully vectorized output:

[code]
f(short*, unsigned int, unsigned int):
        cbz     w1, .LBB0_3
        fmov    s0, w2
        mov     x8, xzr
        mov     w9, w1
.LBB0_2:
        ldr     q1, [x0]
        add     x8, x8, #8
        cmp     x8, x9
        sshll   v2.4s, v1.4h, #0
        sshll2  v1.4s, v1.8h, #0
        mul     v2.4s, v2.4s, v0.s[0]
        mul     v1.4s, v1.4s, v0.s[0]
        cmlt    v3.4s, v2.4s, #0
        cmlt    v4.4s, v1.4s, #0
        usra    v2.4s, v3.4s, #22
        usra    v1.4s, v4.4s, #22
        shrn    v2.4h, v2.4s, #10
        shrn2   v2.8h, v1.4s, #10
        str     q2, [x0], #16
        b.lo    .LBB0_2
.LBB0_3:
        ret
[/code]

gcc 15.2.0 for ARM64 with -O3 fails to vectorize it fully: the widening and the
multiplication are done with vector instructions, but the division by 1024 is
done lane by lane in general-purpose registers and the result is rebuilt
through the stack:

[code]
f(short*, unsigned int, unsigned int):
        cbz     w1, .L7
        sub     sp, sp, #64
        mov     x6, 0
        mov     x5, 0
        mov     x4, 0
        mov     x3, 0
        bfi     x6, x2, 0, 32
        bfi     x5, x2, 0, 32
        bfi     x4, x2, 0, 32
        bfi     x3, x2, 0, 32
        bfi     x6, x2, 32, 32
        bfi     x5, x2, 32, 32
        bfi     x4, x2, 32, 32
        bfi     x3, x2, 32, 32
        mov     w10, 0
        stp     x6, x5, [sp, 32]
        stp     x4, x3, [sp, 48]
.L3:
        ubfiz   x14, x10, 1, 32
        ldr     q29, [sp, 32]
        mov     x15, 0
        ldr     q31, [x0, x14]
        mov     x13, 0
        mov     x12, 0
        mov     x11, 0
        add     w10, w10, 8
        sxtl    v30.4s, v31.4h
        sxtl2   v31.4s, v31.8h
        mul     v30.4s, v30.4s, v29.4s
        ldr     q29, [sp, 48]
        umov    w5, v30.s[1]
        fmov    w2, s30
        mul     v31.4s, v31.4s, v29.4s
        umov    w4, v30.s[2]
        umov    w3, v30.s[3]
        cmp     w2, 0
        add     w9, w2, 1023
        fmov    w6, s31
        csel    w9, w9, w2, lt
        cmp     w5, 0
        add     w8, w4, 1023
        umov    w17, v31.s[1]
        umov    w2, v31.s[2]
        umov    w16, v31.s[3]
        asr     w9, w9, 10
        add     w7, w6, 1023
        add     w6, w5, 1023
        csel    w6, w6, w5, lt
        cmp     w4, 0
        csel    w8, w8, w4, lt
        add     w5, w3, 1023
        cmp     w3, 0
        add     w4, w2, 1023
        csel    w5, w5, w3, lt
        fmov    w3, s31
        asr     w8, w8, 10
        bfi     x15, x9, 0, 32
        asr     w6, w6, 10
        asr     w5, w5, 10
        bfi     x13, x8, 0, 32
        cmp     w3, 0
        bfi     x15, x6, 32, 32
        csel    w7, w7, w3, lt
        cmp     w17, 0
        add     w3, w17, 1023
        bfi     x13, x5, 32, 32
        csel    w3, w3, w17, lt
        cmp     w2, 0
        csel    w4, w4, w2, lt
        cmp     w16, 0
        add     w2, w16, 1023
        asr     w7, w7, 10
        asr     w4, w4, 10
        csel    w2, w2, w16, lt
        bfi     x12, x7, 0, 32
        asr     w3, w3, 10
        bfi     x11, x4, 0, 32
        asr     w2, w2, 10
        bfi     x12, x3, 32, 32
        stp     x15, x13, [sp]
        bfi     x11, x2, 32, 32
        stp     x12, x11, [sp, 16]
        ldp     q31, q30, [sp]
        uzp1    v31.8h, v31.8h, v30.8h
        str     q31, [x0, x14]
        cmp     w1, w10
        bhi     .L3
        add     sp, sp, 64
        ret
.L7:
        ret
[/code]

For comparison, clang 21.1.0 for x86_64 with -O3 -msse4 gives this:

[code]
f(short*, unsigned int, unsigned int):
        test    esi, esi
        je      .LBB0_3
        movd    xmm0, edx
        pshufd  xmm0, xmm0, 0
        mov     eax, esi
        xor     ecx, ecx
        pxor    xmm1, xmm1
.LBB0_2:
        pmovsxwd        xmm2, qword ptr [rdi + 2*rcx]
        pmovsxwd        xmm3, qword ptr [rdi + 2*rcx + 8]
        pmulld  xmm3, xmm0
        pmulld  xmm2, xmm0
        movdqa  xmm4, xmm2
        psrad   xmm4, 31
        psrld   xmm4, 22
        paddd   xmm4, xmm2
        psrld   xmm4, 10
        movdqa  xmm2, xmm3
        psrad   xmm2, 31
        psrld   xmm2, 22
        paddd   xmm2, xmm3
        psrld   xmm2, 10
        pblendw xmm2, xmm1, 170
        pblendw xmm4, xmm1, 170
        packusdw        xmm4, xmm2
        movdqa  xmmword ptr [rdi + 2*rcx], xmm4
        add     rcx, 8
        cmp     rcx, rax
        jb      .LBB0_2
.LBB0_3:
        ret
[/code]

gcc 15.2.0 for x86_64 with -O3 -msse4 produces code that looks like a mixture
of scalar and vectorized code; again the multiplication is vectorized, but the
division is done lane by lane and spilled through the stack:

[code]
f(short*, unsigned int, unsigned int):
        test    esi, esi
        je      .L7
        push    rbp
        xor     ecx, ecx
        pxor    xmm2, xmm2
        mov     rbp, rsp
        and     rsp, -32
        mov     DWORD PTR [rsp-96], edx
        mov     DWORD PTR [rsp-92], edx
        mov     DWORD PTR [rsp-88], edx
        mov     DWORD PTR [rsp-84], edx
        movdqa  xmm0, XMMWORD PTR [rsp-96]
        mov     DWORD PTR [rsp-80], edx
        mov     DWORD PTR [rsp-76], edx
        mov     DWORD PTR [rsp-72], edx
        mov     DWORD PTR [rsp-68], edx
        movaps  XMMWORD PTR [rsp-32], xmm0
        movdqa  xmm0, XMMWORD PTR [rsp-80]
        movaps  XMMWORD PTR [rsp-16], xmm0
.L3:
        mov     eax, ecx
        lea     r8, [rdi+rax*2]
        movdqa  xmm0, XMMWORD PTR [r8]
        pmovsxwd        xmm1, xmm0
        pmulld  xmm1, XMMWORD PTR [rsp-32]
        movd    edx, xmm1
        test    edx, edx
        psrldq  xmm0, 8
        lea     eax, [rdx+1023]
        cmovns  eax, edx
        pextrd  edx, xmm1, 1
        pmovsxwd        xmm0, xmm0
        pmulld  xmm0, XMMWORD PTR [rsp-16]
        sar     eax, 10
        test    edx, edx
        mov     DWORD PTR [rsp-96], eax
        lea     eax, [rdx+1023]
        cmovns  eax, edx
        pextrd  edx, xmm1, 2
        sar     eax, 10
        test    edx, edx
        mov     DWORD PTR [rsp-92], eax
        lea     eax, [rdx+1023]
        cmovns  eax, edx
        sar     eax, 10
        mov     DWORD PTR [rsp-88], eax
        pextrd  eax, xmm1, 3
        test    eax, eax
        lea     edx, [rax+1023]
        cmovs   eax, edx
        movd    edx, xmm0
        sar     eax, 10
        test    edx, edx
        mov     DWORD PTR [rsp-84], eax
        lea     eax, [rdx+1023]
        cmovns  eax, edx
        pextrd  edx, xmm0, 1
        sar     eax, 10
        test    edx, edx
        mov     DWORD PTR [rsp-80], eax
        lea     eax, [rdx+1023]
        cmovns  eax, edx
        pextrd  edx, xmm0, 2
        sar     eax, 10
        test    edx, edx
        mov     DWORD PTR [rsp-76], eax
        lea     eax, [rdx+1023]
        cmovns  eax, edx
        sar     eax, 10
        mov     DWORD PTR [rsp-72], eax
        pextrd  eax, xmm0, 3
        movdqa  xmm0, XMMWORD PTR [rsp-96]
        test    eax, eax
        lea     edx, [rax+1023]
        cmovs   eax, edx
        movaps  XMMWORD PTR [rsp-64], xmm0
        pblendw xmm0, xmm2, 170
        add     ecx, 8
        sar     eax, 10
        mov     DWORD PTR [rsp-68], eax
        movdqa  xmm1, XMMWORD PTR [rsp-80]
        movaps  XMMWORD PTR [rsp-48], xmm1
        pblendw xmm1, xmm2, 170
        packusdw        xmm0, xmm1
        movaps  XMMWORD PTR [r8], xmm0
        cmp     ecx, esi
        jb      .L3
        leave
        ret
.L7:
        ret
[/code]

As a workaround I had to use a helper union and do all operations on shorter
vectors. For ARM64 the output from clang and gcc was similar. For x86_64 the
output from gcc was vectorized, but longer than clang's.

[code]
void f2(int16_t *buf, unsigned int size, uint32_t k)
{
    typedef int16_t v16x8 __attribute__ ((vector_size (16)));
    typedef int32_t v32x8 __attribute__ ((vector_size (32)));
    typedef int32_t v32x4 __attribute__ ((vector_size (16)));

    union u {
        v32x8 v1;
        v32x4 v2[2];
    };

    if (k > 1024)
        __builtin_unreachable();

    for (unsigned int n = 0; n < size; n += 8) {
        v16x8 v1 = *(v16x8*)(buf + n);
        u u;
        u.v1 = __builtin_convertvector(v1, v32x8); /* widen in one step */
        u.v2[0] *= int(k); /* but multiply and divide on 16-byte halves */
        u.v2[1] *= int(k);
        u.v2[0] /= 1024;
        u.v2[1] /= 1024;
        v1 = __builtin_convertvector(u.v1, v16x8);
        *(v16x8*)(buf + n) = v1;
    }
}
[/code]
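For completeness, a minimal harness to check that f and f2 agree (the buffer
size and fill values below are arbitrary, chosen only for illustration;
alignas is needed because the reproducers dereference 16-byte vector
pointers):

[code]
#include <stdint.h>
#include <string.h>

void f(int16_t *buf, unsigned int size, uint32_t k);  /* defined above */
void f2(int16_t *buf, unsigned int size, uint32_t k); /* defined above */

int main()
{
    /* 64 elements, a multiple of the 8-lane step; values are arbitrary. */
    alignas(16) int16_t a[64], b[64];
    for (int i = 0; i < 64; i++)
        a[i] = int16_t(i * 1000 - 32000);
    memcpy(b, a, sizeof a);

    f(a, 64, 700);  /* scale by 700/1024 */
    f2(b, 64, 700);

    return memcmp(a, b, sizeof a) != 0; /* non-zero exit on mismatch */
}
[/code]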
