https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91897
Bug ID: 91897
Summary: Very poor optimization on large attribute vector_size
Product: gcc
Version: 9.2.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: warp at iki dot fi
Target Milestone: ---

Consider the following code:

//-----------------------------------------------------------
typedef double Double16 __attribute__((vector_size(8*16)));

Double16 mult(const Double16& v1, const Double16& v2)
{
    return v1 * v2;
}
//-----------------------------------------------------------

Using the compiler options "-Ofast -march=skylake", clang 9.0.0 produces this output from it:

//-----------------------------------------------------------
    vmovapd ymm0, ymmword ptr [rsi]
    vmovapd ymm1, ymmword ptr [rsi + 32]
    vmovapd ymm2, ymmword ptr [rsi + 64]
    vmovapd ymm3, ymmword ptr [rsi + 96]
    vmulpd ymm0, ymm0, ymmword ptr [rdi]
    vmulpd ymm1, ymm1, ymmword ptr [rdi + 32]
    vmulpd ymm2, ymm2, ymmword ptr [rdi + 64]
    vmulpd ymm3, ymm3, ymmword ptr [rdi + 96]
    ret
//-----------------------------------------------------------

However, gcc 9.2 produces the following output:

//-----------------------------------------------------------
    push rbp
    mov rax, rdi
    mov rbp, rsp
    and rsp, -128
    sub rsp, 392
    vmovdqa xmm5, XMMWORD PTR [rsi]
    vmovdqa xmm6, XMMWORD PTR [rsi+16]
    vmovdqa xmm7, XMMWORD PTR [rsi+32]
    vmovdqa xmm1, XMMWORD PTR [rsi+48]
    vmovdqa xmm2, XMMWORD PTR [rsi+64]
    vmovdqa xmm3, XMMWORD PTR [rsi+80]
    vmovdqa xmm4, XMMWORD PTR [rsi+96]
    vmovaps XMMWORD PTR [rsp+8], xmm5
    vmovaps XMMWORD PTR [rsp+24], xmm6
    vmovdqa xmm5, XMMWORD PTR [rsi+112]
    vmovdqa xmm6, XMMWORD PTR [rdx]
    vmovaps XMMWORD PTR [rsp+40], xmm7
    vmovaps XMMWORD PTR [rsp+56], xmm1
    vmovdqa xmm7, XMMWORD PTR [rdx+16]
    vmovdqa xmm1, XMMWORD PTR [rdx+32]
    vmovaps XMMWORD PTR [rsp+72], xmm2
    vmovaps XMMWORD PTR [rsp+88], xmm3
    vmovdqa xmm2, XMMWORD PTR [rdx+48]
    vmovdqa xmm3, XMMWORD PTR [rdx+64]
    vmovaps XMMWORD PTR [rsp+104], xmm4
    vmovdqa xmm4, XMMWORD PTR [rdx+80]
    vmovaps XMMWORD PTR [rsp+136], xmm6
    vmovaps XMMWORD PTR [rsp+152], xmm7
    vmovaps XMMWORD PTR [rsp+168], xmm1
    vmovaps XMMWORD PTR [rsp+184], xmm2
    vmovaps XMMWORD PTR [rsp+200], xmm3
    vmovaps XMMWORD PTR [rsp+216], xmm4
    vmovaps XMMWORD PTR [rsp+120], xmm5
    vmovdqa xmm5, XMMWORD PTR [rdx+96]
    vmovapd ymm7, YMMWORD PTR [rsp+8]
    vmovapd ymm1, YMMWORD PTR [rsp+40]
    vmulpd ymm0, ymm7, YMMWORD PTR [rsp+136]
    vmovapd ymm2, YMMWORD PTR [rsp+72]
    vmovdqa xmm6, XMMWORD PTR [rdx+112]
    vmovaps XMMWORD PTR [rsp+232], xmm5
    vmovapd ymm5, YMMWORD PTR [rsp+104]
    vmovdqa xmm4, xmm0
    vmovapd YMMWORD PTR [rsp-120], ymm0
    vmulpd ymm0, ymm1, YMMWORD PTR [rsp+168]
    vmovaps XMMWORD PTR [rsp+248], xmm6
    vmovaps XMMWORD PTR [rdi], xmm4
    vmovdqa xmm4, XMMWORD PTR [rsp-104]
    vmovdqa xmm3, xmm0
    vmovapd YMMWORD PTR [rsp-88], ymm0
    vmulpd ymm0, ymm2, YMMWORD PTR [rsp+200]
    vmovaps XMMWORD PTR [rdi+32], xmm3
    vmovdqa xmm3, XMMWORD PTR [rsp-72]
    vmovaps XMMWORD PTR [rdi+16], xmm4
    vmovaps XMMWORD PTR [rdi+48], xmm3
    vmovdqa xmm2, xmm0
    vmovapd YMMWORD PTR [rsp-56], ymm0
    vmulpd ymm0, ymm5, YMMWORD PTR [rsp+232]
    vmovdqa xmm6, XMMWORD PTR [rsp-40]
    vmovaps XMMWORD PTR [rdi+64], xmm2
    vmovaps XMMWORD PTR [rdi+80], xmm6
    vmovapd YMMWORD PTR [rsp-24], ymm0
    vmovdqa xmm7, XMMWORD PTR [rsp-8]
    vmovaps XMMWORD PTR [rdi+96], xmm0
    vmovaps XMMWORD PTR [rdi+112], xmm7
    vzeroupper
    leave
    ret
//-----------------------------------------------------------

Curiously, the current trunk version of gcc available at godbolt as of writing this produces this instead:

//-----------------------------------------------------------
    push rbp
    mov rax, rdi
    mov rbp, rsp
    and rsp, -32
    sub rsp, 8
    vmovapd ymm0, YMMWORD PTR [rsi]
    vmovapd ymm2, YMMWORD PTR [rsi+32]
    vmulpd ymm7, ymm0, YMMWORD PTR [rdx]
    vmulpd ymm1, ymm2, YMMWORD PTR [rdx+32]
    vmovapd ymm4, YMMWORD PTR [rsi+64]
    vmovapd ymm6, YMMWORD PTR [rsi+96]
    vmulpd ymm3, ymm4, YMMWORD PTR [rdx+64]
    vmovapd YMMWORD PTR [rsp-120], ymm7
    mov rcx, QWORD PTR [rsp-112]
    vmovdqa xmm0, XMMWORD PTR [rsp-120]
    mov QWORD PTR [rdi+8], rcx
    mov rcx, QWORD PTR [rsp-104]
    vmulpd ymm5, ymm6, YMMWORD PTR [rdx+96]
    vmovapd YMMWORD PTR [rsp-24], ymm1
    mov QWORD PTR [rdi+16], rcx
    vmovq QWORD PTR [rdi], xmm0
    mov rcx, QWORD PTR [rsp-16]
    mov rdi, QWORD PTR [rsp-96]
    mov QWORD PTR [rax+40], rcx
    mov QWORD PTR [rax+24], rdi
    mov rcx, QWORD PTR [rsp]
    mov rdi, QWORD PTR [rsp-8]
    vmovdqa xmm0, XMMWORD PTR [rsp-24]
    vmovapd YMMWORD PTR [rsp-56], ymm3
    mov QWORD PTR [rax+48], rdi
    mov QWORD PTR [rax+56], rcx
    vmovapd YMMWORD PTR [rsp-88], ymm5
    vmovq QWORD PTR [rax+32], xmm0
    vmovdqa xmm0, XMMWORD PTR [rsp-56]
    mov rdi, QWORD PTR [rsp-48]
    mov rdx, QWORD PTR [rsp-40]
    mov QWORD PTR [rax+72], rdi
    mov QWORD PTR [rax+80], rdx
    mov rcx, QWORD PTR [rsp-32]
    mov rsi, QWORD PTR [rsp-80]
    mov rdi, QWORD PTR [rsp-72]
    mov rdx, QWORD PTR [rsp-64]
    vmovq QWORD PTR [rax+64], xmm0
    vmovdqa xmm0, XMMWORD PTR [rsp-88]
    mov QWORD PTR [rax+88], rcx
    mov QWORD PTR [rax+104], rsi
    mov QWORD PTR [rax+112], rdi
    mov QWORD PTR [rax+120], rdx
    vmovq QWORD PTR [rax+96], xmm0
    vzeroupper
    leave
    ret
//-----------------------------------------------------------