https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108953

            Bug ID: 108953
           Summary: inefficient codegen for trivial equality
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: barry.revzin at gmail dot com
  Target Milestone: ---

Consider this example:

#include <cstdint>
#include <cstddef>
#include <string.h>

struct C
{
   uint8_t a;
   uint8_t b;
   uint8_t c;
   uint8_t d;
   uint16_t e;
   uint16_t f;
   int32_t g;

   bool operator==(C const&) const = default;
};

bool check(C const& lhs, C const& rhs) {
    #ifdef MEMCMP
    return memcmp(&lhs, &rhs, sizeof(lhs)) == 0;
    #else
    return lhs == rhs;
    #endif
}

There are two implementations of check here, but both lead to suboptimal code.

When using MEMCMP, gcc trunk -O3 emits:

check(C const&, C const&):
        mov     rax, QWORD PTR [rsi]
        cmp     QWORD PTR [rdi], rax
        je      .L5
.L2:
        mov     eax, 1
        test    eax, eax
        sete    al
        ret
.L5:
        mov     eax, DWORD PTR [rsi+8]
        cmp     DWORD PTR [rdi+8], eax
        jne     .L2
        xor     eax, eax
        test    eax, eax
        sete    al
        ret

There are a few extra instructions here (mov eax, 1; test eax, eax; sete al;...
do we need all three of those just to return 0?)

When using defaulted comparisons, gcc trunk -O3 doesn't collapse any of the
comparisons, and instead emits 7 distinct checks:

check(C const&, C const&):
        movzx   ecx, BYTE PTR [rsi]
        xor     eax, eax
        cmp     BYTE PTR [rdi], cl
        jne     .L1
        movzx   edx, BYTE PTR [rsi+1]
        cmp     BYTE PTR [rdi+1], dl
        jne     .L1
        movzx   edx, BYTE PTR [rsi+2]
        cmp     BYTE PTR [rdi+2], dl
        jne     .L1
        movzx   edx, BYTE PTR [rsi+3]
        cmp     BYTE PTR [rdi+3], dl
        jne     .L1
        movzx   edx, WORD PTR [rsi+4]
        cmp     WORD PTR [rdi+4], dx
        jne     .L1
        movzx   eax, WORD PTR [rsi+6]
        cmp     WORD PTR [rdi+6], ax
        mov     edx, DWORD PTR [rsi+8]
        sete    al
        cmp     DWORD PTR [rdi+8], edx
        sete    dl
        and     eax, edx
.L1:
        ret

Compare this to clang, which for both the memcmp and the default equality
versions emits this:

check(C const&, C const&):                        # @check(C const&, C const&)
        mov     rax, qword ptr [rdi]
        xor     rax, qword ptr [rsi]
        mov     ecx, dword ptr [rdi + 8]
        xor     ecx, dword ptr [rsi + 8]
        or      rcx, rax
        sete    al
        ret

Looks like there are two missing optimizations here for gcc: (1) the memcmp
does get optimized into an 8-byte and 4-byte comparison, but then the result of
that optimization doesn't get optimized further and (2) multiple trivial
comparisons don't get coalesced together.

Reply via email to