We've already beaten this topic to death, so let's put a final nail in
the coffin:
__to_chars_10_impl is quite fast. According to the IACA the main loop
takes only 6.0 cycles, the whole function with one iteration takes
10.0 cycles. Replacing the __first[pos] and __first[pos - 1] with
__first[0] and __first[1] drops the function time to 7.53 cycles.
Changelog:
2019-09-08 Antony Polukhin <[email protected]>
* include/bits/charconv.h (__detail::__to_chars_10_impl): Replace
final offsets with constants.
And that's the only optimization that improves all the usecases and
reduces code size on 3 instructions.
Different approaches for optimizing the loop were showing different
results depending on the workload. The most interesting result gives
the compressed table of binary coded decimals:
static constexpr unsigned char __binary_coded_decimals[50] =
{ 0x00, 0x02, 0x04, 0x06, 0x08, 0x10... 0x98 };
unsigned __pos = __len - 1;
while (__val >= 100)
{
auto const addition = __val & 1;
auto const __num = (__val % 100) >> 1;
__val /= 100;
auto const __bcd = __binary_coded_decimals[__num];
__first[__pos] = '0' + (__bcd & 0xf) + addition;
__first[__pos - 1] = '0' + (__bcd >> 4);
__pos -= 2;
}
That approach shows the same results or even outperforms the existing
approach with __digits[201] = "0001020304..." in case of cold cache.
It also produces slightly smaller binaries. Unfortunately on a warmed
up cache it's slower than the existing approach. I don't think that
it's a worth change.
Attaching some of the benchmarks as a separate file (not for merge,
just something to experiment with).
--
Best regards,
Antony Polukhin
diff --git a/libstdc++-v3/include/bits/charconv.h
b/libstdc++-v3/include/bits/charconv.h
index 0911660..a5b6be5 100644
--- a/libstdc++-v3/include/bits/charconv.h
+++ b/libstdc++-v3/include/bits/charconv.h
@@ -92,11 +92,11 @@ namespace __detail
if (__val >= 10)
{
auto const __num = __val * 2;
- __first[__pos] = __digits[__num + 1];
- __first[__pos - 1] = __digits[__num];
+ __first[1] = __digits[__num + 1];
+ __first[0] = __digits[__num];
}
else
- __first[__pos] = '0' + __val;
+ __first[0] = '0' + __val;
}
} // namespace __detail
#include <type_traits>
#include <vector>
#include <benchmark/benchmark.h>
// Adjust those variables to test on different workloads
constexpr unsigned From = 130000;
constexpr unsigned To = 130004;
constexpr unsigned Len = 6;
// Bytes of CPU cache to reset
#define RANGES ->Args({16 * 1024 * 1024})
namespace std { namespace __detail {
template<typename _Tp>
int
__to_chars_10_table_2_impl(char* __first, unsigned __len, _Tp __val) noexcept
{
static_assert(is_integral<_Tp>::value, "implementation bug");
static_assert(is_unsigned<_Tp>::value, "implementation bug");
static constexpr char __digits[201] =
"0001020304050607080910111213141516171819"
"2021222324252627282930313233343536373839"
"4041424344454647484950515253545556575859"
"6061626364656667686970717273747576777879"
"8081828384858687888990919293949596979899";
unsigned __pos = __len - 1;
while (__val >= 100)
{
auto const __num = (__val % 100) * 2;
__val /= 100;
__first[__pos] = __digits[__num + 1];
__first[__pos - 1] = __digits[__num];
__pos -= 2;
}
if (__val >= 10)
{
auto const __num = __val * 2;
__first[1] = __digits[__num + 1];
__first[0] = __digits[__num];
}
else
__first[0] = '0' + __val;
return 0;
}
template<typename _Tp>
int
__to_chars_10_bcd_impl(char* __first, unsigned __len, _Tp __val) noexcept
{
static_assert(is_integral<_Tp>::value, "implementation bug");
static_assert(is_unsigned<_Tp>::value, "implementation bug");
static constexpr unsigned char __binary_coded_decimals[100] =
{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10
, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20
, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x30
, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x40
, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x50
, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x60
, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70
, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x80
, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x90
, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99};
unsigned __pos = __len - 1;
while (__val >= 100)
{
auto const __num = __val % 100;
__val /= 100;
auto const __bcd = __binary_coded_decimals[__num];
__first[__pos] = '0' + (__bcd & 0xf);
__first[__pos - 1] = '0' + (__bcd >> 4);
__pos -= 2;
}
if (__val >= 10)
{
auto const __bcd = __binary_coded_decimals[__val];
__first[1] = '0' + (__bcd & 0xf);
__first[0] = '0' + (__bcd >> 4);
}
else
__first[0] = '0' + __val;
return 0;
}
template<typename _Tp>
int
__to_chars_10_bcd_compressed_impl(char* __first, unsigned __len, _Tp __val) noexcept
{
static_assert(is_integral<_Tp>::value, "implementation bug");
static_assert(is_unsigned<_Tp>::value, "implementation bug");
static constexpr unsigned char __binary_coded_decimals[50] =
{ 0x00, 0x02, 0x04, 0x06, 0x08, 0x10
, 0x12, 0x14, 0x16, 0x18, 0x20
, 0x22, 0x24, 0x26, 0x28, 0x30
, 0x32, 0x34, 0x36, 0x38, 0x40
, 0x42, 0x44, 0x46, 0x48, 0x50
, 0x52, 0x54, 0x56, 0x58, 0x60
, 0x62, 0x64, 0x66, 0x68, 0x70
, 0x72, 0x74, 0x76, 0x78, 0x80
, 0x82, 0x84, 0x86, 0x88, 0x90
, 0x92, 0x94, 0x96, 0x98 };
unsigned __pos = __len - 1;
while (__val >= 100)
{
auto const addition = __val & 1;
auto const __num = (__val % 100) >> 1;
__val /= 100;
auto const __bcd = __binary_coded_decimals[__num];
__first[__pos] = '0' + (__bcd & 0xf) + addition;
__first[__pos - 1] = '0' + (__bcd >> 4);
__pos -= 2;
}
if (__val >= 10)
{
auto const __bcd = __binary_coded_decimals[__val >> 1];
__first[1] = '0' + (__bcd & 0xf) + (__val & 1);
__first[0] = '0' + (__bcd >> 4);
}
else
__first[0] = '0' + __val;
return 0;
}
template<typename _Tp>
_GLIBCXX14_CONSTEXPR int
__to_chars_10_naive_impl(char* __first, unsigned __len, _Tp __val) noexcept
{
static_assert(is_integral<_Tp>::value, "implementation bug");
static_assert(is_unsigned<_Tp>::value, "implementation bug");
unsigned __pos = __len - 1;
while (__val >= 100)
{
auto __num = __val % 10;
__val /= 10;
__first[__pos] = '0' + __num;
__num = __val % 10;
__val /= 10;
__first[__pos - 1] = '0' + __num;
__pos -= 2;
}
if (__val >= 10)
{
auto const __num = __val % 10;
__val /= 10;
__first[1] = '0' + __num;
__first[0] = '0' + __val;
}
else
__first[0] = '0' + __val;
return 0;
}
template<typename _Tp>
_GLIBCXX14_CONSTEXPR int
__to_chars_10_naive_unr_impl(char* __first, unsigned __len, _Tp __val) noexcept
{
static_assert(is_integral<_Tp>::value, "implementation bug");
static_assert(is_unsigned<_Tp>::value, "implementation bug");
while (__len >= 6)
{
unsigned __pos = __len - 1;
auto __num = __val % 10;
__val /= 10;
__first[__pos] = '0' + __num;
__num = __val % 10;
__val /= 10;
__first[__pos - 1] = '0' + __num;
__num = __val % 10;
__val /= 10;
__first[__pos - 2] = '0' + __num;
__num = __val % 10;
__val /= 10;
__first[__pos - 3] = '0' + __num;
__num = __val % 10;
__val /= 10;
__first[__pos - 4] = '0' + __num;
__num = __val % 10;
__val /= 10;
__first[__pos - 5] = '0' + __num;
__len -= 6;
}
switch (__len) {
case 5:
__first[4] = '0' + __val % 10;
__val /= 10;
case 4:
__first[3] = '0' + __val % 10;
__val /= 10;
case 3:
__first[2] = '0' + __val % 10;
__val /= 10;
case 2:
__first[1] = '0' + __val % 10;
__val /= 10;
case 1:
__first[0] = '0' + __val;
}
return 0;
}
}}
static void FlushCacheNonpausing(unsigned FlushCacheBaytes) {
std::vector<unsigned> resetter;
resetter.resize(FlushCacheBaytes / sizeof(unsigned), FlushCacheBaytes);
unsigned trash = static_cast<unsigned>(FlushCacheBaytes * FlushCacheBaytes);
for (unsigned& v : resetter) {
trash += v;
v += trash;
}
benchmark::DoNotOptimize(trash);
benchmark::DoNotOptimize(resetter);
}
static void naive(benchmark::State& state) {
const unsigned FlushCacheBaytes = state.range();
for (auto _ : state) {
if (FlushCacheBaytes) {
state.PauseTiming();
FlushCacheNonpausing(FlushCacheBaytes);
state.ResumeTiming();
}
for(unsigned i = From; i != To; ++i)
{
char buffer[Len];
benchmark::DoNotOptimize(std::__detail::__to_chars_10_naive_impl(buffer, Len, i));
benchmark::DoNotOptimize(buffer);
}
}
}
BENCHMARK(naive) RANGES;
static void unrolled(benchmark::State& state) {
const unsigned FlushCacheBaytes = state.range();
for (auto _ : state) {
if (FlushCacheBaytes) {
state.PauseTiming();
FlushCacheNonpausing(FlushCacheBaytes);
state.ResumeTiming();
}
for(unsigned i = From; i != To; ++i)
{
char buffer[Len];
benchmark::DoNotOptimize(std::__detail::__to_chars_10_naive_unr_impl(buffer, Len, i));
benchmark::DoNotOptimize(buffer);
}
}
}
BENCHMARK(unrolled) RANGES;
static void table_2(benchmark::State& state) {
const unsigned FlushCacheBaytes = state.range();
for (auto _ : state) {
if (FlushCacheBaytes) {
state.PauseTiming();
FlushCacheNonpausing(FlushCacheBaytes);
state.ResumeTiming();
}
for(unsigned i = From; i != To; ++i)
{
char buffer[Len];
benchmark::DoNotOptimize(std::__detail::__to_chars_10_table_2_impl(buffer, Len, i));
benchmark::DoNotOptimize(buffer);
}
}
}
BENCHMARK(table_2) RANGES;
static void bcd(benchmark::State& state) {
const unsigned FlushCacheBaytes = state.range();
for (auto _ : state) {
if (FlushCacheBaytes) {
state.PauseTiming();
FlushCacheNonpausing(FlushCacheBaytes);
state.ResumeTiming();
}
for(unsigned i = From; i != To; ++i)
{
char buffer[Len];
benchmark::DoNotOptimize(std::__detail::__to_chars_10_bcd_impl(buffer, Len, i));
benchmark::DoNotOptimize(buffer);
}
}
}
BENCHMARK(bcd) RANGES;
static void compressed_bin_coded(benchmark::State& state) {
const unsigned FlushCacheBaytes = state.range();
for (auto _ : state) {
if (FlushCacheBaytes) {
state.PauseTiming();
FlushCacheNonpausing(FlushCacheBaytes);
state.ResumeTiming();
}
for(unsigned i = From; i != To; ++i)
{
char buffer[Len];
benchmark::DoNotOptimize(std::__detail::__to_chars_10_bcd_compressed_impl(buffer, Len, i));
benchmark::DoNotOptimize(buffer);
}
}
}
BENCHMARK(compressed_bin_coded) RANGES;