We've already beaten this topic to death, so let's put a final nail in the coffin:
__to_chars_10_impl is quite fast. According to the IACA the main loop takes only 6.0 cycles, the whole function with one iteration takes 10.0 cycles. Replacing the __first[pos] and __first[pos - 1] with __first[0] and __first[1] drops the function time to 7.53 cycles. Changelog: 2019-09-08 Antony Polukhin <antosh...@gmail.com> * include/bits/charconv.h (__detail::__to_chars_10_impl): Replace final offsets with constants. And that's the only optimization that improves all the usecases and reduces code size on 3 instructions. Different approaches for optimizing the loop were showing different results depending on the workload. The most interesting result gives the compressed table of binary coded decimals: static constexpr unsigned char __binary_coded_decimals[50] = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x10... 0x98 }; unsigned __pos = __len - 1; while (__val >= 100) { auto const addition = __val & 1; auto const __num = (__val % 100) >> 1; __val /= 100; auto const __bcd = __binary_coded_decimals[__num]; __first[__pos] = '0' + (__bcd & 0xf) + addition; __first[__pos - 1] = '0' + (__bcd >> 4); __pos -= 2; } That approach shows the same results or even outperforms the existing approach with __digits[201] = "0001020304..." in case of cold cache. It also produces slightly smaller binaries. Unfortunately on a warmed up cache it's slower than the existing approach. I don't think that it's a worth change. Attaching some of the benchmarks as a separate file (not for merge, just something to experiment with). -- Best regards, Antony Polukhin
diff --git a/libstdc++-v3/include/bits/charconv.h b/libstdc++-v3/include/bits/charconv.h index 0911660..a5b6be5 100644 --- a/libstdc++-v3/include/bits/charconv.h +++ b/libstdc++-v3/include/bits/charconv.h @@ -92,11 +92,11 @@ namespace __detail if (__val >= 10) { auto const __num = __val * 2; - __first[__pos] = __digits[__num + 1]; - __first[__pos - 1] = __digits[__num]; + __first[1] = __digits[__num + 1]; + __first[0] = __digits[__num]; } else - __first[__pos] = '0' + __val; + __first[0] = '0' + __val; } } // namespace __detail
#include <type_traits> #include <vector> #include <benchmark/benchmark.h> // Adjust those variables to test on different workloads constexpr unsigned From = 130000; constexpr unsigned To = 130004; constexpr unsigned Len = 6; // Bytes of CPU cache to reset #define RANGES ->Args({16 * 1024 * 1024}) namespace std { namespace __detail { template<typename _Tp> int __to_chars_10_table_2_impl(char* __first, unsigned __len, _Tp __val) noexcept { static_assert(is_integral<_Tp>::value, "implementation bug"); static_assert(is_unsigned<_Tp>::value, "implementation bug"); static constexpr char __digits[201] = "0001020304050607080910111213141516171819" "2021222324252627282930313233343536373839" "4041424344454647484950515253545556575859" "6061626364656667686970717273747576777879" "8081828384858687888990919293949596979899"; unsigned __pos = __len - 1; while (__val >= 100) { auto const __num = (__val % 100) * 2; __val /= 100; __first[__pos] = __digits[__num + 1]; __first[__pos - 1] = __digits[__num]; __pos -= 2; } if (__val >= 10) { auto const __num = __val * 2; __first[1] = __digits[__num + 1]; __first[0] = __digits[__num]; } else __first[0] = '0' + __val; return 0; } template<typename _Tp> int __to_chars_10_bcd_impl(char* __first, unsigned __len, _Tp __val) noexcept { static_assert(is_integral<_Tp>::value, "implementation bug"); static_assert(is_unsigned<_Tp>::value, "implementation bug"); static constexpr unsigned char __binary_coded_decimals[100] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10 , 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20 , 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x30 , 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x40 , 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x50 , 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x60 , 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70 , 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x80 , 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x90 , 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99}; unsigned __pos = __len - 1; while (__val >= 100) { auto const __num = __val % 100; __val /= 100; auto const __bcd = __binary_coded_decimals[__num]; __first[__pos] = '0' + (__bcd & 0xf); __first[__pos - 1] = '0' + (__bcd >> 4); __pos -= 2; } if (__val >= 10) { auto const __bcd = __binary_coded_decimals[__val]; __first[1] = '0' + (__bcd & 0xf); __first[0] = '0' + (__bcd >> 4); } else __first[0] = '0' + __val; return 0; } template<typename _Tp> int __to_chars_10_bcd_compressed_impl(char* __first, unsigned __len, _Tp __val) noexcept { static_assert(is_integral<_Tp>::value, "implementation bug"); static_assert(is_unsigned<_Tp>::value, "implementation bug"); static constexpr unsigned char __binary_coded_decimals[50] = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x10 , 0x12, 0x14, 0x16, 0x18, 0x20 , 0x22, 0x24, 0x26, 0x28, 0x30 , 0x32, 0x34, 0x36, 0x38, 0x40 , 0x42, 0x44, 0x46, 0x48, 0x50 , 0x52, 0x54, 0x56, 0x58, 0x60 , 0x62, 0x64, 0x66, 0x68, 0x70 , 0x72, 0x74, 0x76, 0x78, 0x80 , 0x82, 0x84, 0x86, 0x88, 0x90 , 0x92, 0x94, 0x96, 0x98 }; unsigned __pos = __len - 1; while (__val >= 100) { auto const addition = __val & 1; auto const __num = (__val % 100) >> 1; __val /= 100; auto const __bcd = __binary_coded_decimals[__num]; __first[__pos] = '0' + (__bcd & 0xf) + addition; __first[__pos - 1] = '0' + (__bcd >> 4); __pos -= 2; } if (__val >= 10) { auto const __bcd = __binary_coded_decimals[__val >> 1]; __first[1] = '0' + (__bcd & 0xf) + (__val & 1); __first[0] = '0' + (__bcd >> 4); } else __first[0] = '0' + __val; return 0; } template<typename _Tp> _GLIBCXX14_CONSTEXPR int __to_chars_10_naive_impl(char* __first, unsigned __len, _Tp __val) noexcept { static_assert(is_integral<_Tp>::value, "implementation bug"); static_assert(is_unsigned<_Tp>::value, "implementation bug"); unsigned __pos = __len - 1; while (__val >= 100) { auto __num = __val % 10; __val /= 10; __first[__pos] = '0' + __num; __num = __val % 10; __val /= 10; __first[__pos - 1] = '0' + __num; __pos -= 2; } if (__val >= 10) { auto const __num = __val % 10; __val /= 10; __first[1] = '0' + __num; __first[0] = '0' + __val; } else __first[0] = '0' + __val; return 0; } template<typename _Tp> _GLIBCXX14_CONSTEXPR int __to_chars_10_naive_unr_impl(char* __first, unsigned __len, _Tp __val) noexcept { static_assert(is_integral<_Tp>::value, "implementation bug"); static_assert(is_unsigned<_Tp>::value, "implementation bug"); while (__len >= 6) { unsigned __pos = __len - 1; auto __num = __val % 10; __val /= 10; __first[__pos] = '0' + __num; __num = __val % 10; __val /= 10; __first[__pos - 1] = '0' + __num; __num = __val % 10; __val /= 10; __first[__pos - 2] = '0' + __num; __num = __val % 10; __val /= 10; __first[__pos - 3] = '0' + __num; __num = __val % 10; __val /= 10; __first[__pos - 4] = '0' + __num; __num = __val % 10; __val /= 10; __first[__pos - 5] = '0' + __num; __len -= 6; } switch (__len) { case 5: __first[4] = '0' + __val % 10; __val /= 10; case 4: __first[3] = '0' + __val % 10; __val /= 10; case 3: __first[2] = '0' + __val % 10; __val /= 10; case 2: __first[1] = '0' + __val % 10; __val /= 10; case 1: __first[0] = '0' + __val; } return 0; } }} static void FlushCacheNonpausing(unsigned FlushCacheBaytes) { std::vector<unsigned> resetter; resetter.resize(FlushCacheBaytes / sizeof(unsigned), FlushCacheBaytes); unsigned trash = static_cast<unsigned>(FlushCacheBaytes * FlushCacheBaytes); for (unsigned& v : resetter) { trash += v; v += trash; } benchmark::DoNotOptimize(trash); benchmark::DoNotOptimize(resetter); } static void naive(benchmark::State& state) { const unsigned FlushCacheBaytes = state.range(); for (auto _ : state) { if (FlushCacheBaytes) { state.PauseTiming(); FlushCacheNonpausing(FlushCacheBaytes); state.ResumeTiming(); } for(unsigned i = From; i != To; ++i) { char buffer[Len]; benchmark::DoNotOptimize(std::__detail::__to_chars_10_naive_impl(buffer, Len, i)); benchmark::DoNotOptimize(buffer); } } } BENCHMARK(naive) RANGES; static void unrolled(benchmark::State& state) { const unsigned FlushCacheBaytes = state.range(); for (auto _ : state) { if (FlushCacheBaytes) { state.PauseTiming(); FlushCacheNonpausing(FlushCacheBaytes); state.ResumeTiming(); } for(unsigned i = From; i != To; ++i) { char buffer[Len]; benchmark::DoNotOptimize(std::__detail::__to_chars_10_naive_unr_impl(buffer, Len, i)); benchmark::DoNotOptimize(buffer); } } } BENCHMARK(unrolled) RANGES; static void table_2(benchmark::State& state) { const unsigned FlushCacheBaytes = state.range(); for (auto _ : state) { if (FlushCacheBaytes) { state.PauseTiming(); FlushCacheNonpausing(FlushCacheBaytes); state.ResumeTiming(); } for(unsigned i = From; i != To; ++i) { char buffer[Len]; benchmark::DoNotOptimize(std::__detail::__to_chars_10_table_2_impl(buffer, Len, i)); benchmark::DoNotOptimize(buffer); } } } BENCHMARK(table_2) RANGES; static void bcd(benchmark::State& state) { const unsigned FlushCacheBaytes = state.range(); for (auto _ : state) { if (FlushCacheBaytes) { state.PauseTiming(); FlushCacheNonpausing(FlushCacheBaytes); state.ResumeTiming(); } for(unsigned i = From; i != To; ++i) { char buffer[Len]; benchmark::DoNotOptimize(std::__detail::__to_chars_10_bcd_impl(buffer, Len, i)); benchmark::DoNotOptimize(buffer); } } } BENCHMARK(bcd) RANGES; static void compressed_bin_coded(benchmark::State& state) { const unsigned FlushCacheBaytes = state.range(); for (auto _ : state) { if (FlushCacheBaytes) { state.PauseTiming(); FlushCacheNonpausing(FlushCacheBaytes); state.ResumeTiming(); } for(unsigned i = From; i != To; ++i) { char buffer[Len]; benchmark::DoNotOptimize(std::__detail::__to_chars_10_bcd_compressed_impl(buffer, Len, i)); benchmark::DoNotOptimize(buffer); } } } BENCHMARK(compressed_bin_coded) RANGES;