https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89226

            Bug ID: 89226
           Summary: codegen for copying a 512-bit object fails to use avx
                    instructions
           Product: gcc
           Version: 9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: barry.revzin at gmail dot com
  Target Milestone: ---

Consider the following example:

#include <x86intrin.h>

// DUMB PAIR
// Holds two 256-bit vectors in a single array member that is over-aligned to
// 2*sizeof(__m256i) bytes. The struct declares no special member functions,
// so copying it uses the implicitly-defined copy operations.
struct dumb_pair {
    alignas(2*sizeof(__m256i)) __m256i x[2];
};

// Copies `from` into `to` using dumb_pair's implicitly-defined copy
// assignment operator.
void copy1(const dumb_pair& from, dumb_pair& to) {
    to = from;
}

// SMART PAIR
// Two 256-bit vector members plus a user-provided copy assignment operator.
struct foo512 {
    __m256i a;
    __m256i b;

    // Assigns the two members one at a time and returns *this by reference
    // (the return type is deduced from `auto&`).
    auto& operator=(const foo512& f) {
        a = f.a;
        b = f.b;
        return *this;
    }
};

// Wraps an anonymous union that overlays a foo512 with a two-element __m256i
// array. Both copy operations below copy through the foo512 member `y`, and
// therefore go through foo512::operator=.
struct smart_pair {
    union {
        foo512 y;
        __m256i x[2];
    };
    // Copy constructor: assigns sp.y to y in the constructor body (there is
    // no member initializer list).
    smart_pair(const smart_pair& sp) {
        y = sp.y;
    }

    // Copy assignment: assigns sp.y to y and returns *this.
    smart_pair& operator=(const smart_pair& sp) {
        y = sp.y;
        return *this;
    }
};

// Copies `from` into `to` using smart_pair's user-provided copy assignment
// operator.
void copy2(const smart_pair& from, smart_pair& to) {
    to = from;
}

When compiled with:

g++ -mavx -O3 -march=corei7-avx -mtune=corei7-avx

the latest gcc (trunk, as well as 8.2 and 7.4) emits the following
(https://godbolt.org/z/mZj4VU):

copy1(dumb_pair const&, dumb_pair&):
        vmovdqa xmm0, XMMWORD PTR [rdi]
        vmovaps XMMWORD PTR [rsi], xmm0
        vmovdqa xmm1, XMMWORD PTR [rdi+16]
        vmovaps XMMWORD PTR [rsi+16], xmm1
        vmovdqa xmm2, XMMWORD PTR [rdi+32]
        vmovaps XMMWORD PTR [rsi+32], xmm2
        vmovdqa xmm3, XMMWORD PTR [rdi+48]
        vmovaps XMMWORD PTR [rsi+48], xmm3
        ret
copy2(smart_pair const&, smart_pair&):
        vmovdqa ymm0, YMMWORD PTR [rdi]
        vmovdqa ymm1, YMMWORD PTR [rdi+32]
        vmovdqa YMMWORD PTR [rsi], ymm0
        vmovdqa YMMWORD PTR [rsi+32], ymm1
        vzeroupper
        ret

copy2() is better than copy1(): it copies the object with two 256-bit (ymm)
loads and stores, whereas copy1() needs four 128-bit (xmm) loads and stores.
If we remove the user-provided copy assignment operator from foo512 (whose
implementation does the same thing as the default one), the smart_pair code
becomes the same as the dumb_pair code.

clang++ emits the same code in both cases: the same code as copy2() in this
example.

Reply via email to