bits/atomicity.h has volatile qualifiers on the _Atomic_word* arguments to the __*_single and __*_dispatch variants of the atomic operations. This especially hurts the single-threaded optimization variants, which are usually inlined. Removing those qualifiers allows code size to be reduced significantly, as can be seen in the following simple testcase:
#include <string> #include <cstring> char *foo (const char *s, const char *s2) { std::string ss(s); std::string ss2(s2); return strdup((ss + ss2).c_str()); } which shrinks from 408 bytes to 388 bytes of text size on x86. The reduction comes from CSE in the fast path, which changes from <L14>:; __mem = (volatile _Atomic_word *) &this->D.15413._M_refcount; if (__gthrw_pthread_cancel != 0B) goto <L17>; else goto <L18>; <L17>:; __result = __exchange_and_add (__mem, -1); goto <bb 12> (<L19>); <L18>:; __result = *__mem; D.16636 = *__mem; D.16637 = D.16636 + -1; *__mem = D.16637; <L19>:; if (__result <= 0) goto <L15>; else goto <L30>; to <L14>:; if (__gthrw_pthread_cancel != 0B) goto <L17>; else goto <L18>; <L17>:; __result = __exchange_and_add (&this->D.15413._M_refcount, -1); goto <bb 12> (<L19>); <L18>:; __result = this->D.15413._M_refcount; this->D.15413._M_refcount = __result + -1; <L19>:; if (__result <= 0) goto <L15>; else goto <L30>; as the _M_refcount member of basic_string is not volatile. FYI, Richard.