https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122901
Bug ID: 122901
Summary: gcc is not able to handle pair of vectors coming from
__builtin_convertvector
Product: gcc
Version: 15.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: [email protected]
Target Milestone: ---
The function below takes an array of int16_t values and multiplies each of them by a
factor from the range [0, 1]. It is written for integer arithmetic, so it takes the
factor as a value from the range [0, 1024], multiplies by it and divides by 1024. To
avoid integer overflow, it has to extend all values to int32 before the
multiplication and narrow the result back to int16 at the end. It uses
__builtin_convertvector for both conversions:
[code]
#include <stdint.h>
void f(int16_t *buf, unsigned int size, uint32_t k)
{
    typedef int16_t v16x8 __attribute__ ((vector_size (16)));
    typedef int32_t v32x8 __attribute__ ((vector_size (32)));
    if (k > 1024)
        __builtin_unreachable();
    for (unsigned int n = 0; n < size; n += 8) {
        v16x8 v1 = *(v16x8*)(buf + n);
        v32x8 v2 = __builtin_convertvector(v1, v32x8);
        v2 *= int(k);
        v2 /= 1024;
        v1 = __builtin_convertvector(v2, v16x8);
        *(v16x8*)(buf + n) = v1;
    }
}
[/code]
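For reference, here is a rough scalar sketch (illustrative names only) of what each
loop iteration computes per element. Signed division in C/C++ truncates toward zero,
so for negative products (v * k) / 1024 is not just an arithmetic shift right by 10;
that is where the cmlt/usra fix-up in the clang output and the add-1023/csel (or
lea/cmovns) sequences below come from.
[code]
#include <stdint.h>

// Illustrative scalar equivalent of one element of f(); k in [0, 1024].
static inline int16_t scale_one(int16_t v, uint32_t k)
{
    int32_t p = (int32_t)v * (int32_t)k;   // widen first to avoid overflow
    return (int16_t)(p / 1024);            // truncating signed division
}
[/code]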
clang 21.1.0 for ARM64 with -O3 gives nice, fully vectorized output:
[code]
f(short*, unsigned int, unsigned int):
cbz w1, .LBB0_3
fmov s0, w2
mov x8, xzr
mov w9, w1
.LBB0_2:
ldr q1, [x0]
add x8, x8, #8
cmp x8, x9
sshll v2.4s, v1.4h, #0
sshll2 v1.4s, v1.8h, #0
mul v2.4s, v2.4s, v0.s[0]
mul v1.4s, v1.4s, v0.s[0]
cmlt v3.4s, v2.4s, #0
cmlt v4.4s, v1.4s, #0
usra v2.4s, v3.4s, #22
usra v1.4s, v4.4s, #22
shrn v2.4h, v2.4s, #10
shrn2 v2.8h, v1.4s, #10
str q2, [x0], #16
b.lo .LBB0_2
.LBB0_3:
ret
[/code]
gcc 15.2.0 for ARM64 with -O3 fails to vectorize it fully; the multiplication stays
in NEON registers, but the division by 1024 is done element by element in
general-purpose registers:
[code]
f(short*, unsigned int, unsigned int):
cbz w1, .L7
sub sp, sp, #64
mov x6, 0
mov x5, 0
mov x4, 0
mov x3, 0
bfi x6, x2, 0, 32
bfi x5, x2, 0, 32
bfi x4, x2, 0, 32
bfi x3, x2, 0, 32
bfi x6, x2, 32, 32
bfi x5, x2, 32, 32
bfi x4, x2, 32, 32
bfi x3, x2, 32, 32
mov w10, 0
stp x6, x5, [sp, 32]
stp x4, x3, [sp, 48]
.L3:
ubfiz x14, x10, 1, 32
ldr q29, [sp, 32]
mov x15, 0
ldr q31, [x0, x14]
mov x13, 0
mov x12, 0
mov x11, 0
add w10, w10, 8
sxtl v30.4s, v31.4h
sxtl2 v31.4s, v31.8h
mul v30.4s, v30.4s, v29.4s
ldr q29, [sp, 48]
umov w5, v30.s[1]
fmov w2, s30
mul v31.4s, v31.4s, v29.4s
umov w4, v30.s[2]
umov w3, v30.s[3]
cmp w2, 0
add w9, w2, 1023
fmov w6, s31
csel w9, w9, w2, lt
cmp w5, 0
add w8, w4, 1023
umov w17, v31.s[1]
umov w2, v31.s[2]
umov w16, v31.s[3]
asr w9, w9, 10
add w7, w6, 1023
add w6, w5, 1023
csel w6, w6, w5, lt
cmp w4, 0
csel w8, w8, w4, lt
add w5, w3, 1023
cmp w3, 0
add w4, w2, 1023
csel w5, w5, w3, lt
fmov w3, s31
asr w8, w8, 10
bfi x15, x9, 0, 32
asr w6, w6, 10
asr w5, w5, 10
bfi x13, x8, 0, 32
cmp w3, 0
bfi x15, x6, 32, 32
csel w7, w7, w3, lt
cmp w17, 0
add w3, w17, 1023
bfi x13, x5, 32, 32
csel w3, w3, w17, lt
cmp w2, 0
csel w4, w4, w2, lt
cmp w16, 0
add w2, w16, 1023
asr w7, w7, 10
asr w4, w4, 10
csel w2, w2, w16, lt
bfi x12, x7, 0, 32
asr w3, w3, 10
bfi x11, x4, 0, 32
asr w2, w2, 10
bfi x12, x3, 32, 32
stp x15, x13, [sp]
bfi x11, x2, 32, 32
stp x12, x11, [sp, 16]
ldp q31, q30, [sp]
uzp1 v31.8h, v31.8h, v30.8h
str q31, [x0, x14]
cmp w1, w10
bhi .L3
add sp, sp, 64
ret
.L7:
ret
[/code]
For comparison, clang 21.1.0 for x86_64 with -O3 -msse4 gives this:
[code]
f(short*, unsigned int, unsigned int):
test esi, esi
je .LBB0_3
movd xmm0, edx
pshufd xmm0, xmm0, 0
mov eax, esi
xor ecx, ecx
pxor xmm1, xmm1
.LBB0_2:
pmovsxwd xmm2, qword ptr [rdi + 2*rcx]
pmovsxwd xmm3, qword ptr [rdi + 2*rcx + 8]
pmulld xmm3, xmm0
pmulld xmm2, xmm0
movdqa xmm4, xmm2
psrad xmm4, 31
psrld xmm4, 22
paddd xmm4, xmm2
psrld xmm4, 10
movdqa xmm2, xmm3
psrad xmm2, 31
psrld xmm2, 22
paddd xmm2, xmm3
psrld xmm2, 10
pblendw xmm2, xmm1, 170
pblendw xmm4, xmm1, 170
packusdw xmm4, xmm2
movdqa xmmword ptr [rdi + 2*rcx], xmm4
add rcx, 8
cmp rcx, rax
jb .LBB0_2
.LBB0_3:
ret
[/code]
gcc 15.2.0 for x86_64 with -O3 -msse4 produces code which looks like a mixture of
scalar and vectorized code; the multiplication is vectorized, but the division by
1024 is done one element at a time:
[code]
f(short*, unsigned int, unsigned int):
test esi, esi
je .L7
push rbp
xor ecx, ecx
pxor xmm2, xmm2
mov rbp, rsp
and rsp, -32
mov DWORD PTR [rsp-96], edx
mov DWORD PTR [rsp-92], edx
mov DWORD PTR [rsp-88], edx
mov DWORD PTR [rsp-84], edx
movdqa xmm0, XMMWORD PTR [rsp-96]
mov DWORD PTR [rsp-80], edx
mov DWORD PTR [rsp-76], edx
mov DWORD PTR [rsp-72], edx
mov DWORD PTR [rsp-68], edx
movaps XMMWORD PTR [rsp-32], xmm0
movdqa xmm0, XMMWORD PTR [rsp-80]
movaps XMMWORD PTR [rsp-16], xmm0
.L3:
mov eax, ecx
lea r8, [rdi+rax*2]
movdqa xmm0, XMMWORD PTR [r8]
pmovsxwd xmm1, xmm0
pmulld xmm1, XMMWORD PTR [rsp-32]
movd edx, xmm1
test edx, edx
psrldq xmm0, 8
lea eax, [rdx+1023]
cmovns eax, edx
pextrd edx, xmm1, 1
pmovsxwd xmm0, xmm0
pmulld xmm0, XMMWORD PTR [rsp-16]
sar eax, 10
test edx, edx
mov DWORD PTR [rsp-96], eax
lea eax, [rdx+1023]
cmovns eax, edx
pextrd edx, xmm1, 2
sar eax, 10
test edx, edx
mov DWORD PTR [rsp-92], eax
lea eax, [rdx+1023]
cmovns eax, edx
sar eax, 10
mov DWORD PTR [rsp-88], eax
pextrd eax, xmm1, 3
test eax, eax
lea edx, [rax+1023]
cmovs eax, edx
movd edx, xmm0
sar eax, 10
test edx, edx
mov DWORD PTR [rsp-84], eax
lea eax, [rdx+1023]
cmovns eax, edx
pextrd edx, xmm0, 1
sar eax, 10
test edx, edx
mov DWORD PTR [rsp-80], eax
lea eax, [rdx+1023]
cmovns eax, edx
pextrd edx, xmm0, 2
sar eax, 10
test edx, edx
mov DWORD PTR [rsp-76], eax
lea eax, [rdx+1023]
cmovns eax, edx
sar eax, 10
mov DWORD PTR [rsp-72], eax
pextrd eax, xmm0, 3
movdqa xmm0, XMMWORD PTR [rsp-96]
test eax, eax
lea edx, [rax+1023]
cmovs eax, edx
movaps XMMWORD PTR [rsp-64], xmm0
pblendw xmm0, xmm2, 170
add ecx, 8
sar eax, 10
mov DWORD PTR [rsp-68], eax
movdqa xmm1, XMMWORD PTR [rsp-80]
movaps XMMWORD PTR [rsp-48], xmm1
pblendw xmm1, xmm2, 170
packusdw xmm0, xmm1
movaps XMMWORD PTR [r8], xmm0
cmp ecx, esi
jb .L3
leave
ret
.L7:
ret
[/code]
As a workaround I had to use a helper union and do all operations on shorter
vectors. For ARM64 the output from clang and gcc was similar. For x86_64 the output
from gcc was vectorized, but longer than the one from clang.
[code]
void f2(int16_t *buf, unsigned int size, uint32_t k)
{
    typedef int16_t v16x8 __attribute__ ((vector_size (16)));
    typedef int32_t v32x8 __attribute__ ((vector_size (32)));
    typedef int32_t v32x4 __attribute__ ((vector_size (16)));
    union u {
        v32x8 v1;
        v32x4 v2[2];
    };
    if (k > 1024)
        __builtin_unreachable();
    for (unsigned int n = 0; n < size; n += 8) {
        v16x8 v1 = *(v16x8*)(buf + n);
        u u;
        u.v1 = __builtin_convertvector(v1, v32x8);
        u.v2[0] *= int(k);
        u.v2[1] *= int(k);
        u.v2[0] /= 1024;
        u.v2[1] /= 1024;
        v1 = __builtin_convertvector(u.v1, v16x8);
        *(v16x8*)(buf + n) = v1;
    }
}
[/code]
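In case it helps, a rough sketch of a check that f and f2 compute the same results;
it assumes both functions above are in the same translation unit, and the buffer
contents, size and k below are arbitrary illustrations.
[code]
#include <stdint.h>
#include <stdio.h>

int main()
{
    enum { N = 64 };            // multiple of 8, as both loops assume
    // 16-byte aligned, since the casts to v16x8* imply aligned vector accesses
    alignas(16) int16_t a[N], b[N];
    for (int i = 0; i < N; i++)
        a[i] = b[i] = (int16_t)((i - 32) * 1000);
    f(a, N, 700);               // original version
    f2(b, N, 700);              // union workaround
    for (int i = 0; i < N; i++)
        if (a[i] != b[i]) {
            printf("mismatch at %d: %d vs %d\n", i, a[i], b[i]);
            return 1;
        }
    return 0;
}
[/code]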