https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117776
Bug ID: 117776
Summary: Missed optimization/vectorization opportunity (adding a bool to an accumulator)
Product: gcc
Version: 14.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: nicula.iccc at gmail dot com
Target Milestone: ---
Consider the following code:
#include <cstddef>
#include <cstdint>
using u64 = uint64_t;
using u8 = uint8_t;
const auto is_even_bool = [](u8 n) -> bool {
    return n % 2 == 0;
};

const auto is_even_u64 = [](u8 n) -> u64 {
    return n % 2 == 0;
};

const auto is_even_convert = [](u8 n) -> u64 {
    return is_even_bool(n);
};

u64 count_even_v1(u8 *data, size_t len)
{
    u64 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += is_even_bool(data[i]); // Not vectorized
    return ret;
}

u64 count_even_v2(u8 *data, size_t len)
{
    u64 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += is_even_u64(data[i]); // Vectorized
    return ret;
}

u64 count_even_v3(u8 *data, size_t len)
{
    u64 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += is_even_convert(data[i]); // Not vectorized
    return ret;
}

u64 count_even_v4(u8 *data, size_t len)
{
    u64 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += static_cast<u64>(is_even_bool(data[i])); // Not vectorized
    return ret;
}
The following assembly is generated by g++ 14.2, with flags -O3
-march=skylake-avx512 (godbolt: https://godbolt.org/z/c7W9G6WbW):
count_even_v1(unsigned char*, unsigned long):
        test    rsi, rsi
        je      .L4
        add     rsi, rdi
        xor     edx, edx
.L3:
        movzx   eax, BYTE PTR [rdi]
        inc     rdi
        not     eax
        and     eax, 1
        add     rdx, rax
        cmp     rsi, rdi
        jne     .L3
        mov     rax, rdx
        ret
.L4:
        xor     edx, edx
        mov     rax, rdx
        ret
count_even_v2(unsigned char*, unsigned long):
        mov     rcx, rsi
        test    rsi, rsi
        je      .L15
        lea     rax, [rsi-1]
        cmp     rax, 30
        jbe     .L16
        mov     rdx, rsi
        mov     r8d, 16843009
        vpxor   xmm5, xmm5, xmm5
        mov     rax, rdi
        and     rdx, -32
        vpbroadcastd    ymm6, r8d
        lea     rsi, [rdx+rdi]
.L10:
        vmovdqa ymm0, ymm6
        add     rax, 32
        vpternlogd      ymm0, ymm0, YMMWORD PTR [rax-32], 0x44
        vpmovzxbw       ymm1, xmm0
        vextracti32x4   xmm0, ymm0, 0x1
        vpmovzxwd       ymm3, xmm1
        vpmovzxbw       ymm0, xmm0
        vextracti32x4   xmm1, ymm1, 0x1
        vpmovzxdq       ymm2, xmm3
        vextracti32x4   xmm3, ymm3, 0x1
        vpmovzxwd       ymm4, xmm0
        vpmovzxdq       ymm3, xmm3
        vextracti32x4   xmm0, ymm0, 0x1
        vpmovzxwd       ymm1, xmm1
        vpmovzxwd       ymm0, xmm0
        vpaddq  ymm2, ymm2, ymm3
        vpmovzxdq       ymm3, xmm4
        vpaddq  ymm2, ymm2, ymm3
        vpmovzxdq       ymm3, xmm0
        vextracti32x4   xmm4, ymm4, 0x1
        vpaddq  ymm2, ymm2, ymm3
        vpmovzxdq       ymm3, xmm1
        vextracti32x4   xmm1, ymm1, 0x1
        vpmovzxdq       ymm1, xmm1
        vpmovzxdq       ymm4, xmm4
        vextracti32x4   xmm0, ymm0, 0x1
        vpaddq  ymm1, ymm3, ymm1
        vpmovzxdq       ymm0, xmm0
        vpaddq  ymm1, ymm1, ymm4
        vpaddq  ymm0, ymm1, ymm0
        vpaddq  ymm0, ymm2, ymm0
        vpaddq  ymm5, ymm5, ymm0
        cmp     rax, rsi
        jne     .L10
        vextracti64x2   xmm0, ymm5, 0x1
        vpaddq  xmm0, xmm0, xmm5
        vpsrldq xmm1, xmm0, 8
        vmovdqa xmm3, xmm0
        vpaddq  xmm1, xmm0, xmm1
        vmovq   rax, xmm1
        cmp     rcx, rdx
        je      .L22
        vzeroupper
.L9:
        mov     rsi, rcx
        sub     rsi, rdx
        lea     r8, [rsi-1]
        cmp     r8, 14
        jbe     .L13
        mov     eax, 16843009
        mov     r8, rsi
        vpbroadcastd    xmm0, eax
        and     r8, -16
        vpternlogd      xmm0, xmm0, XMMWORD PTR [rdi+rdx], 0x44
        add     rdx, r8
        and     esi, 15
        vpmovzxbw       xmm2, xmm0
        vpsrldq xmm0, xmm0, 8
        vpmovzxwd       xmm4, xmm2
        vpsrldq xmm2, xmm2, 8
        vpmovzxbw       xmm0, xmm0
        vpmovzxdq       xmm1, xmm4
        vpsrldq xmm4, xmm4, 8
        vpmovzxwd       xmm5, xmm0
        vpmovzxdq       xmm4, xmm4
        vpsrldq xmm0, xmm0, 8
        vpmovzxwd       xmm2, xmm2
        vpaddq  xmm1, xmm1, xmm4
        vpsrldq xmm4, xmm5, 8
        vpmovzxwd       xmm0, xmm0
        vpmovzxdq       xmm4, xmm4
        vpmovzxdq       xmm5, xmm5
        vpaddq  xmm1, xmm1, xmm4
        vpmovzxdq       xmm4, xmm0
        vpaddq  xmm1, xmm1, xmm4
        vpsrldq xmm0, xmm0, 8
        vpaddq  xmm1, xmm1, xmm3
        vpmovzxdq       xmm3, xmm2
        vpmovzxdq       xmm0, xmm0
        vpsrldq xmm2, xmm2, 8
        vpmovzxdq       xmm2, xmm2
        vpaddq  xmm2, xmm3, xmm2
        vpaddq  xmm2, xmm2, xmm5
        vpaddq  xmm0, xmm2, xmm0
        vpaddq  xmm0, xmm1, xmm0
        vpsrldq xmm1, xmm0, 8
        vpaddq  xmm0, xmm0, xmm1
        vmovq   rax, xmm0
        je      .L7
.L13:
        movzx   esi, BYTE PTR [rdi+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+1]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+1+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+2]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+2+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+3]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+3+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+4]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+4+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+5]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+5+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+6]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+6+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+7]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+7+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+8]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+8+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+9]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+9+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+10]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+10+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+11]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+11+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+12]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+12+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+13]
        cmp     rsi, rcx
        jnb     .L7
        movzx   esi, BYTE PTR [rdi+13+rdx]
        not     esi
        and     esi, 1
        add     rax, rsi
        lea     rsi, [rdx+14]
        cmp     rsi, rcx
        jnb     .L7
        movzx   edx, BYTE PTR [rdi+14+rdx]
        not     edx
        and     edx, 1
        add     rax, rdx
        ret
.L15:
        xor     eax, eax
.L7:
        ret
.L16:
        vpxor   xmm3, xmm3, xmm3
        xor     edx, edx
        xor     eax, eax
        jmp     .L9
.L22:
        vzeroupper
        ret
count_even_v3(unsigned char*, unsigned long):
        test    rsi, rsi
        je      .L26
        add     rsi, rdi
        xor     edx, edx
.L25:
        movzx   eax, BYTE PTR [rdi]
        inc     rdi
        not     eax
        and     eax, 1
        add     rdx, rax
        cmp     rsi, rdi
        jne     .L25
        mov     rax, rdx
        ret
.L26:
        xor     edx, edx
        mov     rax, rdx
        ret
count_even_v4(unsigned char*, unsigned long):
        test    rsi, rsi
        je      .L31
        add     rsi, rdi
        xor     edx, edx
.L30:
        movzx   eax, BYTE PTR [rdi]
        inc     rdi
        not     eax
        and     eax, 1
        add     rdx, rax
        cmp     rsi, rdi
        jne     .L30
        mov     rax, rdx
        ret
.L31:
        xor     edx, edx
        mov     rax, rdx
        ret
So why is only count_even_v2() successfully vectorized? For example, shouldn't
the compiler have an easy time seeing that:
ret += is_even_bool(data[i]);
is essentially the same thing as:
ret += is_even_u64(data[i]);
because is_even_bool(data[i]) will be converted to a u64 before the
add-assignment is performed? Clang generates identical code for all four
versions and successfully vectorizes them.
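At the source level the equivalence follows directly from the built-in
conversion rules; a minimal sketch (the function and variable names are
illustrative):

#include <cstdint>
using u64 = uint64_t;

void promotion_example()
{
    u64 ret = 0;
    bool b = true;
    ret += b;                         // compound assignment on a bool operand...
    ret = ret + static_cast<u64>(b);  // ...behaves like this: b is converted
                                      // to u64 (yielding 0 or 1) before the add
}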
Also, an interesting observation: g++ does not fail to vectorize the same code
when the accumulator has the same type as the data (i.e. we have "u32 *data"
and "u32 ret", for example). In that case, all four versions are vectorized:
https://godbolt.org/z/6WenP9Y3T
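For reference, a minimal sketch of that matching-type variant (the function
name is illustrative; the vectorization claim is taken from the godbolt link
above):

#include <cstddef>
#include <cstdint>
using u32 = uint32_t;

u32 count_even_u32(u32 *data, size_t len)
{
    u32 ret = 0;
    for (size_t i = 0; i < len; i++)
        ret += data[i] % 2 == 0; // bool result added to a same-width
                                 // accumulator; g++ vectorizes this
    return ret;
}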
Note: the same behavior occurs with g++ trunk.