https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117776
Bug ID: 117776
Summary: Missed optimization/vectorization opportunity (adding a bool to an accumulator)
Product: gcc
Version: 14.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: nicula.iccc at gmail dot com
Target Milestone: ---
Consider the following code:
#include <cstddef>
#include <cstdint>
using u64 = uint64_t;
using u8 = uint8_t;
const auto is_even_bool = [](u8 n) -> bool {
    return n % 2 == 0;
};

const auto is_even_u64 = [](u8 n) -> u64 {
    return n % 2 == 0;
};

const auto is_even_convert = [](u8 n) -> u64 {
    return is_even_bool(n);
};

u64 count_even_v1(u8 *data, size_t len)
{
    u64 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += is_even_bool(data[i]); // Not vectorized
    return ret;
}

u64 count_even_v2(u8 *data, size_t len)
{
    u64 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += is_even_u64(data[i]); // Vectorized
    return ret;
}

u64 count_even_v3(u8 *data, size_t len)
{
    u64 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += is_even_convert(data[i]); // Not vectorized
    return ret;
}

u64 count_even_v4(u8 *data, size_t len)
{
    u64 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += static_cast<u64>(is_even_bool(data[i])); // Not vectorized
    return ret;
}
The following assembly is generated by g++ 14.2, with flags -O3
-march=skylake-avx512 (godbolt: https://godbolt.org/z/c7W9G6WbW):
count_even_v1(unsigned char*, unsigned long):
        test    rsi, rsi
        je      .L4
        add     rsi, rdi
        xor     edx, edx
.L3:
        movzx   eax, BYTE PTR [rdi]
        inc     rdi
        not     eax
        and     eax, 1
        add     rdx, rax
        cmp     rsi, rdi
        jne     .L3
        mov     rax, rdx
        ret
.L4:
        xor     edx, edx
        mov     rax, rdx
        ret
count_even_v2(unsigned char*, unsigned long):
        mov     rcx, rsi
        test    rsi, rsi
        je      .L15
        lea     rax, [rsi-1]
        cmp     rax, 30
        jbe     .L16
        mov     rdx, rsi
        mov     r8d, 16843009
        vpxor   xmm5, xmm5, xmm5
        mov     rax, rdi
        and     rdx, -32
        vpbroadcastd    ymm6, r8d
        lea     rsi, [rdx+rdi]
.L10:
        vmovdqa ymm0, ymm6
        add     rax, 32
        vpternlogd      ymm0, ymm0, YMMWORD PTR [rax-32], 0x44
        vpmovzxbw       ymm1, xmm0
        vextracti32x4   xmm0, ymm0, 0x1
        vpmovzxwd       ymm3, xmm1
        vpmovzxbw       ymm0, xmm0
        vextracti32x4   xmm1, ymm1, 0x1
        vpmovzxdq       ymm2, xmm3
        vextracti32x4   xmm3, ymm3, 0x1
        vpmovzxwd       ymm4, xmm0
        vpmovzxdq       ymm3, xmm3
        vextracti32x4   xmm0, ymm0, 0x1
        vpmovzxwd       ymm1, xmm1
        vpmovzxwd       ymm0, xmm0
        vpaddq  ymm2, ymm2, ymm3
        vpmovzxdq       ymm3, xmm4
        vpaddq  ymm2, ymm2, ymm3
        vpmovzxdq       ymm3, xmm0
        vextracti32x4   xmm4, ymm4, 0x1
        vpaddq  ymm2, ymm2, ymm3
        vpmovzxdq       ymm3, xmm1
        vextracti32x4   xmm1, ymm1, 0x1
        vpmovzxdq       ymm1, xmm1
        vpmovzxdq       ymm4, xmm4
        vextracti32x4   xmm0, ymm0, 0x1
        vpaddq  ymm1, ymm3, ymm1
        vpmovzxdq       ymm0, xmm0
        vpaddq  ymm1, ymm1, ymm4
        vpaddq  ymm0, ymm1, ymm0
        vpaddq  ymm0, ymm2, ymm0
        vpaddq  ymm5, ymm5, ymm0
        cmp     rax, rsi
        jne     .L10
        vextracti64x2   xmm0, ymm5, 0x1
        vpaddq  xmm0, xmm0, xmm5
        vpsrldq xmm1, xmm0, 8
        vmovdqa xmm3, xmm0
        vpaddq  xmm1, xmm0, xmm1
        vmovq   rax, xmm1
        cmp     rcx, rdx
        je      .L22
        vzeroupper
.L9:
        mov     rsi, rcx
        sub     rsi, rdx
        lea     r8, [rsi-1]
        cmp     r8, 14
        jbe     .L13
        mov     eax, 16843009
        mov     r8, rsi
        vpbroadcastd    xmm0, eax
        and     r8, -16
        vpternlogd      xmm0, xmm0, XMMWORD PTR [rdi+rdx], 0x44
        add     rdx, r8
        and     esi, 15
        vpmovzxbw       xmm2, xmm0
        vpsrldq xmm0, xmm0, 8
        vpmovzxwd       xmm4, xmm2
        vpsrldq xmm2, xmm2, 8
        vpmovzxbw       xmm0, xmm0
        vpmovzxdq       xmm1, xmm4
        vpsrldq xmm4, xmm4, 8
        vpmovzxwd       xmm5, xmm0
        vpmovzxdq       xmm4, xmm4
        vpsrldq xmm0, xmm0, 8
        vpmovzxwd       xmm2, xmm2
        vpaddq  xmm1, xmm1, xmm4
        vpsrldq xmm4, xmm5, 8
        vpmovzxwd       xmm0, xmm0
        vpmovzxdq       xmm4, xmm4
        vpmovzxdq       xmm5, xmm5
        vpaddq  xmm1, xmm1, xmm4
        vpmovzxdq       xmm4, xmm0
        vpaddq  xmm1, xmm1, xmm4
        vpsrldq xmm0, xmm0, 8
        vpaddq  xmm1, xmm1, xmm3
        vpmovzxdq       xmm3, xmm2
        vpmovzxdq       xmm0, xmm0
        vpsrldq xmm2, xmm2, 8
        vpmovzxdq       xmm2, xmm2
        vpaddq  xmm2, xmm3, xmm2
        vpaddq  xmm2, xmm2, xmm5
        vpaddq  xmm0, xmm2, xmm0
        vpaddq  xmm0, xmm1, xmm0
        vpsrldq xmm1, xmm0, 8
        vpaddq  xmm0, xmm0, xmm1
        vmovq   rax, xmm0
        je      .L7
.L13:
        movzx   esi, BYTE PTR [rdi+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+1]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+1+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+2]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+2+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+3]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+3+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+4]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+4+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+5]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+5+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+6]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+6+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+7]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+7+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+8]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+8+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+9]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+9+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+10]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+10+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+11]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+11+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+12]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+12+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+13]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+13+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+14]
        cmp     rsi, rcx
        jnb     .L7
        movzx   edx, BYTE PTR [rdi+14+rdx]
        not     edx
        and     edx, 1
        add     rax, rdx
        ret
.L15:
        xor     eax, eax
.L7:
        ret
.L16:
        vpxor   xmm3, xmm3, xmm3
        xor     edx, edx
        xor     eax, eax
        jmp     .L9
.L22:
        vzeroupper
        ret
count_even_v3(unsigned char*, unsigned long):
        test    rsi, rsi
        je      .L26
        add     rsi, rdi
        xor     edx, edx
.L25:
        movzx   eax, BYTE PTR [rdi]
        inc     rdi
        not     eax
        and     eax, 1
        add     rdx, rax
        cmp     rsi, rdi
        jne     .L25
        mov     rax, rdx
        ret
.L26:
        xor     edx, edx
        mov     rax, rdx
        ret
count_even_v4(unsigned char*, unsigned long):
        test    rsi, rsi
        je      .L31
        add     rsi, rdi
        xor     edx, edx
.L30:
        movzx   eax, BYTE PTR [rdi]
        inc     rdi
        not     eax
        and     eax, 1
        add     rdx, rax
        cmp     rsi, rdi
        jne     .L30
        mov     rax, rdx
        ret
.L31:
        xor     edx, edx
        mov     rax, rdx
        ret
So why is only count_even_v2() successfully vectorized? For example, shouldn't
the compiler have an easy time seeing that:
ret += is_even_bool(data[i]);
is essentially the same thing as:
ret += is_even_u64(data[i]);
because is_even_bool(data[i]) will be converted to a u64 before the
add-assignment is performed? Clang generates identical code for all four
versions and successfully vectorizes them.
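At the source level the equivalence follows directly from the built-in
conversion rules; a minimal sketch (the function and variable names are
illustrative):

#include <cstdint>
using u64 = uint64_t;

void promotion_example()
{
    u64 ret = 0;
    bool b = true;
    ret += b;                         // compound assignment on a bool operand...
    ret = ret + static_cast<u64>(b);  // ...behaves like this: b is converted
                                      // to u64 (yielding 0 or 1) before the add
}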
Also, an interesting observation: g++ does not fail to vectorize the same code
when the accumulator has the same type as the data (i.e. we have "u32 *data"
and "u32 ret", for example). In that case, all four versions are vectorized:
https://godbolt.org/z/6WenP9Y3T
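For reference, a minimal sketch of that matching-type variant (the function
name is illustrative; the vectorization claim is taken from the godbolt link
above):

#include <cstddef>
#include <cstdint>
using u32 = uint32_t;

u32 count_even_u32(u32 *data, size_t len)
{
    u32 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += data[i] % 2 == 0; // bool result added to a same-width
                                 // accumulator; g++ vectorizes this
    return ret;
}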
Note: the same behavior occurs with g++ trunk.