https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103393

            Bug ID: 103393
           Summary: [ 12 Regression ] Auto vectorizer generating 256bit
                    register usage with -mprefer-avx128
                    -mprefer-vector-width=128
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jschoen4 at gmail dot com
  Target Milestone: ---

gcc -v
Using built-in specs.
COLLECT_GCC=/gcc_build/bin/gcc
COLLECT_LTO_WRAPPER=/gcc_build/bin/../libexec/gcc/x86_64-pc-linux-gnu/12.0.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --prefix=/gcc_build --include=/gcc_build/include
--disable-multilib --enable-rpath --enable-__cxa_atexit --enable-nls
--disable-checking --disable-libunwind-exceptions --enable-bootstrap
--enable-shared --enable-static --enable-threads=posix --with-gcc --with-gnu-as
--with-gnu-ld --with-system-zlib
--enable-languages=c,c++,fortran,go,objc,obj-c++ --enable-lto
--enable-stage1-languages=c
Thread model: posix
Supported LTO compression algorithms: zlib
gcc version 12.0.0 20211123 (experimental) (GCC)

Branch: trunk, w/ a latest commit of 721d8b9e26bf8205c1f2125c2626919a408cdbe4

===========
=TEST CODE=
===========
# cat test.cpp
// Reproducer payload: a plain aggregate holding a fixed array of 8 floats.
struct TestData {
  float arr[8];
};
// Copies all 8 elements of s2.arr into s1.arr, one element per iteration.
// This fixed-trip-count loop is the code whose generated assembly is
// compared across GCC versions in this report.
void cpy( TestData& s1, TestData& s2 ) {
  for(int i=0; i<8; ++i) {
    s1.arr[i] = s2.arr[i];
  }
}

===========
=cmd      =
===========
gcc -S -masm=intel -O2 -mavx -mprefer-avx128 -mprefer-vector-width=128 -Wall
-Wextra test.cpp -o test.s

===========
=BAD ASM  =
= GCC 12  =
===========
cat test.s
        .file   "test.cpp"
        .intel_syntax noprefix
        .text
        .p2align 4
        .globl  _Z3cpyR8TestDataS0_
        .type   _Z3cpyR8TestDataS0_, @function
_Z3cpyR8TestDataS0_:
.LFB0:
        .cfi_startproc
        vmovdqu ymm0, YMMWORD PTR [rsi]
        vmovdqu YMMWORD PTR [rdi], ymm0
        vzeroupper
        ret
        .cfi_endproc
.LFE0:
        .size   _Z3cpyR8TestDataS0_, .-_Z3cpyR8TestDataS0_
        .ident  "GCC: (GNU) 12.0.0 20211123 (experimental)"
        .section        .note.GNU-stack,"",@progbits

===========
= GCC 11  = (GCC 10 generates identical asm)
===========
cat test.s
        .file   "test.cpp"
        .intel_syntax noprefix
        .text
        .p2align 4
        .globl  _Z3cpyR8TestDataS0_
        .type   _Z3cpyR8TestDataS0_, @function
_Z3cpyR8TestDataS0_:
.LFB0:
        .cfi_startproc
        mov     edx, 32
        jmp     memmove
        .cfi_endproc
.LFE0:
        .size   _Z3cpyR8TestDataS0_, .-_Z3cpyR8TestDataS0_
        .ident  "GCC: (GNU) 11.2.0"
        .section        .note.GNU-stack,"",@progbits

=========
= GCC 9 =
=========
cat test.s
        .file   "test.cpp"
        .intel_syntax noprefix
        .text
        .p2align 4
        .globl  _Z3cpyR8TestDataS0_
        .type   _Z3cpyR8TestDataS0_, @function
_Z3cpyR8TestDataS0_:
.LFB0:
        .cfi_startproc
        xor     eax, eax
        .p2align 4,,10
        .p2align 3
.L2:
        vmovss  xmm0, DWORD PTR [rsi+rax]
        vmovss  DWORD PTR [rdi+rax], xmm0
        add     rax, 4
        cmp     rax, 32
        jne     .L2
        ret
        .cfi_endproc
.LFE0:
        .size   _Z3cpyR8TestDataS0_, .-_Z3cpyR8TestDataS0_
        .ident  "GCC: (GNU) 9.3.0"
        .section        .note.GNU-stack,"",@progbits




The auto-vectorizer is generating YMM / 256-bit vector instructions even when
the -mprefer-avx128 and -mprefer-vector-width=128 flags are specified.  This is
an issue for low-latency software: using 256-bit and wider registers causes CPU
jitter problems on Skylake / Cascade Lake / Ice Lake chips.  This is true even
in cases where the instructions used are considered AVX256-light instructions,
because a "mix of instructions" is used to determine the power levels (this is
also mentioned in Intel's optimization manual).

The auto-vectorizer needs to respect the preferred-width flags.  Enabling/using
newer instruction sets (e.g. AVX/AVX2/AVX-512) does not require usage of the
wider register types.

Reply via email to