https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81501
Bug ID: 81501 Summary: Unnecessary calls to __tls_get_addr() in simple thread-singleton pattern Product: gcc Version: 7.1.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: j...@jak-linux.org Target Milestone: --- I only tested this on amd64, but see for yourself: + cat t.cc struct foo { foo(); ~foo(); }; foo *test() { static thread_local foo foo_tls; return &foo_tls; } + g++-7 -std=c++14 -v -pthread -fPIC -shared -O2 -o gcc.so t.cc Using built-in specs. COLLECT_GCC=/usr/bin/g++-7 COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/7/lto-wrapper OFFLOAD_TARGET_NAMES=nvptx-none OFFLOAD_TARGET_DEFAULT=1 Target: x86_64-linux-gnu Configured with: ../src/configure -v --with-pkgversion='Debian 7.1.0-9' --with-bugurl=file:///usr/share/doc/gcc-7/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++ --prefix=/usr --with-gcc-major-version-only --program-suffix=-7 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --enable-default-pie --with-system-zlib --with-target-system-zlib --enable-objc-gc=auto --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu Thread model: posix gcc version 7.1.0 (Debian 7.1.0-9) COLLECT_GCC_OPTIONS='-std=c++14' '-v' '-pthread' '-fPIC' '-shared' '-O2' '-o' 'gcc.so' '-shared-libgcc' '-mtune=generic' '-march=x86-64' /usr/lib/gcc/x86_64-linux-gnu/7/cc1plus -quiet -v -imultiarch x86_64-linux-gnu 
-D_GNU_SOURCE -D_REENTRANT t.cc -quiet -dumpbase t.cc -mtune=generic -march=x86-64 -auxbase t -O2 -std=c++14 -version -fPIC -o /tmp/ccdUrCDS.s GNU C++14 (Debian 7.1.0-9) version 7.1.0 (x86_64-linux-gnu) compiled by GNU C version 7.1.0, GMP version 6.1.2, MPFR version 3.1.5, MPC version 1.0.3, isl version isl-0.18-GMP GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/7" ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu" ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/7/../../../../x86_64-linux-gnu/include" #include "..." search starts here: #include <...> search starts here: /usr/include/c++/7 /usr/include/x86_64-linux-gnu/c++/7 /usr/include/c++/7/backward /usr/lib/gcc/x86_64-linux-gnu/7/include /usr/local/include /usr/lib/gcc/x86_64-linux-gnu/7/include-fixed /usr/include/x86_64-linux-gnu /usr/include End of search list. GNU C++14 (Debian 7.1.0-9) version 7.1.0 (x86_64-linux-gnu) compiled by GNU C version 7.1.0, GMP version 6.1.2, MPFR version 3.1.5, MPC version 1.0.3, isl version isl-0.18-GMP GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 Compiler executable checksum: 3681302eda59faba4e53a905eca4bf72 COLLECT_GCC_OPTIONS='-std=c++14' '-v' '-pthread' '-fPIC' '-shared' '-O2' '-o' 'gcc.so' '-shared-libgcc' '-mtune=generic' '-march=x86-64' as -v --64 -o /tmp/ccI2B3TO.o /tmp/ccdUrCDS.s GNU assembler version 2.28 (x86_64-linux-gnu) using BFD version (GNU Binutils for Debian) 2.28 COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/7/:/usr/lib/gcc/x86_64-linux-gnu/7/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/7/:/usr/lib/gcc/x86_64-linux-gnu/ 
LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/7/:/usr/lib/gcc/x86_64-linux-gnu/7/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/7/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/7/../../../:/lib/:/usr/lib/ COLLECT_GCC_OPTIONS='-std=c++14' '-v' '-pthread' '-fPIC' '-shared' '-O2' '-o' 'gcc.so' '-shared-libgcc' '-mtune=generic' '-march=x86-64' /usr/lib/gcc/x86_64-linux-gnu/7/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/7/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/7/lto-wrapper -plugin-opt=-fresolution=/tmp/cc9S0zbL.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lpthread -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu -shared -o gcc.so /usr/lib/gcc/x86_64-linux-gnu/7/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/7/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/7 -L/usr/lib/gcc/x86_64-linux-gnu/7/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/7/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/7/../../.. /tmp/ccI2B3TO.o -lstdc++ -lm -lgcc_s -lpthread -lc -lgcc_s /usr/lib/gcc/x86_64-linux-gnu/7/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/7/../../../x86_64-linux-gnu/crtn.o COLLECT_GCC_OPTIONS='-std=c++14' '-v' '-pthread' '-fPIC' '-shared' '-O2' '-o' 'gcc.so' '-shared-libgcc' '-mtune=generic' '-march=x86-64' + gdb -q -ex disassemble test -ex quit gcc.so Reading symbols from gcc.so...(no debugging symbols found)...done. 
Dump of assembler code for function _Z4testv: 0x00000000000007f0 <+0>: push %rbx 0x00000000000007f1 <+1>: sub $0x10,%rsp 0x00000000000007f5 <+5>: lea 0x2007cc(%rip),%rdi # 0x200fc8 0x00000000000007fc <+12>: callq 0x6e0 <__tls_get_addr@plt> 0x0000000000000801 <+17>: cmpb $0x0,0x0(%rax) 0x0000000000000808 <+24>: jne 0x840 <_Z4testv+80> 0x000000000000080a <+26>: lea 0x8(%rax),%rbx 0x0000000000000811 <+33>: mov %rax,0x8(%rsp) 0x0000000000000816 <+38>: mov %rbx,%rdi 0x0000000000000819 <+41>: callq 0x6d0 <_ZN3fooC1Ev@plt> 0x000000000000081e <+46>: mov 0x8(%rsp),%rax 0x0000000000000823 <+51>: mov 0x2007b6(%rip),%rdi # 0x200fe0 0x000000000000082a <+58>: lea 0x2007ff(%rip),%rdx # 0x201030 0x0000000000000831 <+65>: mov %rbx,%rsi 0x0000000000000834 <+68>: movb $0x1,0x0(%rax) 0x000000000000083b <+75>: callq 0x6f0 <__cxa_thread_atexit@plt> 0x0000000000000840 <+80>: lea 0x200781(%rip),%rdi # 0x200fc8 0x0000000000000847 <+87>: callq 0x6e0 <__tls_get_addr@plt> 0x000000000000084c <+92>: add $0x10,%rsp 0x0000000000000850 <+96>: add $0x8,%rax 0x0000000000000856 <+102>: pop %rbx 0x0000000000000857 <+103>: retq End of assembler dump. As you can see, after the first call to __tls_get_addr() at +12, the jne at +24 jumps to a second call to __tls_get_addr() at +87 (the fall-through path, after __cxa_thread_atexit, reaches that second call as well). GCC should only need to get the address once here, as clang does: + clang++ -std=c++14 -pthread -fPIC -shared -O2 -o clang.so t.cc + gdb -q -ex disassemble test -ex quit clang.so Reading symbols from clang.so...(no debugging symbols found)...done. 
Dump of assembler code for function _Z4testv: 0x00000000000007a0 <+0>: push %r14 0x00000000000007a2 <+2>: push %rbx 0x00000000000007a3 <+3>: push %rax 0x00000000000007a4 <+4>: lea 0x20081d(%rip),%rdi # 0x200fc8 0x00000000000007ab <+11>: callq 0x690 <__tls_get_addr@plt> 0x00000000000007b0 <+16>: mov %rax,%rbx 0x00000000000007b3 <+19>: mov 0x1(%rax),%al 0x00000000000007b9 <+25>: and $0x1,%al 0x00000000000007bb <+27>: jne 0x7ef <_Z4testv+79> 0x00000000000007bd <+29>: mov %rbx,%rax 0x00000000000007c0 <+32>: lea 0x0(%rax),%r14 0x00000000000007c7 <+39>: mov %r14,%rdi 0x00000000000007ca <+42>: callq 0x680 <_ZN3fooC1Ev@plt> 0x00000000000007cf <+47>: mov 0x20080a(%rip),%rdi # 0x200fe0 0x00000000000007d6 <+54>: lea 0x200853(%rip),%rdx # 0x201030 0x00000000000007dd <+61>: mov %r14,%rsi 0x00000000000007e0 <+64>: callq 0x6a0 <__cxa_thread_atexit@plt> 0x00000000000007e5 <+69>: mov %rbx,%rax 0x00000000000007e8 <+72>: movb $0x1,0x1(%rax) 0x00000000000007ef <+79>: mov %rbx,%rax 0x00000000000007f2 <+82>: lea 0x0(%rax),%rax 0x00000000000007f9 <+89>: add $0x8,%rsp 0x00000000000007fd <+93>: pop %rbx 0x00000000000007fe <+94>: pop %r14 0x0000000000000800 <+96>: retq End of assembler dump. The redundant second call to __tls_get_addr() in the GCC output has some performance overhead, which I'd like to avoid.