This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 7c145faa80 [Enhance] use fast_float::from_chars to do str cast to float/double to avoid lose precision (#16190) 7c145faa80 is described below commit 7c145faa809f6ff70d895c91097ea7cc0edf8fc1 Author: HappenLee <happen...@hotmail.com> AuthorDate: Wed Feb 1 23:53:34 2023 +0800 [Enhance] use fast_float::from_chars to do str cast to float/double to avoid lose precision (#16190) --- be/src/util/string_parser.hpp | 146 ++++++++++-------------------------- be/test/util/string_parser_test.cpp | 5 +- 2 files changed, 41 insertions(+), 110 deletions(-) diff --git a/be/src/util/string_parser.hpp b/be/src/util/string_parser.hpp index 653f0dac14..02006b7c7d 100644 --- a/be/src/util/string_parser.hpp +++ b/be/src/util/string_parser.hpp @@ -20,6 +20,8 @@ #pragma once +#include <fast_float/fast_float.h> + #include <cmath> #include <cstdint> #include <cstring> @@ -111,13 +113,7 @@ public: template <typename T> static inline T string_to_float(const char* s, int len, ParseResult* result) { - T ans = string_to_float_internal<T>(s, len, result); - if (LIKELY(*result == PARSE_SUCCESS)) { - return ans; - } - - int i = skip_leading_whitespace(s, len); - return string_to_float_internal<T>(s + i, len - i, result); + return string_to_float_internal<T>(s, len, result); } // Parses a string for 'true' or 'false', case insensitive. @@ -425,118 +421,54 @@ inline T StringParser::string_to_int_no_overflow(const char* s, int len, ParseRe template <typename T> inline T StringParser::string_to_float_internal(const char* s, int len, ParseResult* result) { - if (UNLIKELY(len <= 0)) { + int i = 0; + // skip leading spaces + for (; i < len; ++i) { + if (!is_whitespace(s[i])) { + break; + } + } + + // skip back spaces + int j = len - 1; + for (; j >= i; j--) { + if (!is_whitespace(s[j])) { + break; + } + } + + // skip leading '+', from_chars can handle '-' + if (i < len && s[i] == '+') { + i++; + } + if (UNLIKELY(i > j)) { *result = PARSE_FAILURE; return 0; } // Use double here to not lose precision while accumulating the result double val = 0; - bool negative = false; - int i = 0; - double divide = 1; - bool decimal = false; - int64_t remainder = 0; - // The number of 'significant figures' we've encountered so far (i.e., digits excluding - // leading 0s). This technically shouldn't count trailing 0s either, but for us it - // doesn't matter if we count them based on the implementation below. - int sig_figs = 0; - - switch (*s) { - case '-': - negative = true; - case '+': - i = 1; - } - - int first = i; - for (; i < len; ++i) { - if (LIKELY(s[i] >= '0' && s[i] <= '9')) { - if (s[i] != '0' || sig_figs > 0) { - ++sig_figs; - } - if (decimal) { - // According to the IEEE floating-point spec, a double has up to 15-17 - // significant decimal digits (see - // http://en.wikipedia.org/wiki/Double-precision_floating-point_format). We stop - // processing digits after we've already seen at least 18 sig figs to avoid - // overflowing 'remainder' (we stop after 18 instead of 17 to get the rounding - // right). - if (sig_figs <= 18) { - remainder = remainder * 10 + s[i] - '0'; - divide *= 10; + auto res = fast_float::from_chars(s + i, s + j + 1, val); + + if (res.ec == std::errc() && res.ptr == s + j + 1) { + if (abs(val) == std::numeric_limits<T>::infinity()) { + auto contain_inf = false; + for (int k = i; k < j + 1; k++) { + if (s[k] == 'i' || s[k] == 'I') { + contain_inf = true; + break; } - } else { - val = val * 10 + s[i] - '0'; - } - } else if (s[i] == '.') { - decimal = true; - } else if (s[i] == 'e' || s[i] == 'E') { - break; - } else if (s[i] == 'i' || s[i] == 'I') { - if (len > i + 2 && (s[i + 1] == 'n' || s[i + 1] == 'N') && - (s[i + 2] == 'f' || s[i + 2] == 'F')) { - // Note: Hive writes inf as Infinity, at least for text. We'll be a little loose - // here and interpret any column with inf as a prefix as infinity rather than - // checking every remaining byte. - *result = PARSE_SUCCESS; - return negative ? -INFINITY : INFINITY; - } else { - // Starts with 'i', but isn't inf... - *result = PARSE_FAILURE; - return 0; - } - } else if (s[i] == 'n' || s[i] == 'N') { - if (len > i + 2 && (s[i + 1] == 'a' || s[i + 1] == 'A') && - (s[i + 2] == 'n' || s[i + 2] == 'N')) { - *result = PARSE_SUCCESS; - return negative ? -NAN : NAN; - } else { - // Starts with 'n', but isn't NaN... - *result = PARSE_FAILURE; - return 0; - } - } else { - if ((UNLIKELY(i == first || !is_all_whitespace(s + i, len - i)))) { - // Reject the string because either the first char was not a digit, "," or "e", - // or the remaining chars are not all whitespace - *result = PARSE_FAILURE; - return 0; } - // skip trailing whitespace. - break; - } - } - val += remainder / divide; - - if (i < len && (s[i] == 'e' || s[i] == 'E')) { - // Create a C-string from s starting after the optional '-' sign and fall back to - // strtod to avoid conversion inaccuracy for scientific notation. - // Do not use boost::lexical_cast because it causes codegen to crash for an - // unknown reason (exception handling?). - char c_str[len - negative + 1]; - memcpy(c_str, s + negative, len - negative); - c_str[len - negative] = '\0'; - char* s_end; - val = strtod(c_str, &s_end); - if (s_end != c_str + len - negative) { - // skip trailing whitespace - int trailing_len = len - negative - (int)(s_end - c_str); - if (UNLIKELY(!is_all_whitespace(s_end, trailing_len))) { - *result = PARSE_FAILURE; - return val; - } + *result = contain_inf ? PARSE_SUCCESS : PARSE_OVERFLOW; + } else { + *result = PARSE_SUCCESS; } - } - - // Determine if it is an overflow case and update the result - if (UNLIKELY(val == std::numeric_limits<T>::infinity())) { - *result = PARSE_OVERFLOW; + return val; } else { - *result = PARSE_SUCCESS; + *result = PARSE_FAILURE; } - return (T)(negative ? -val : val); + return 0; } inline bool StringParser::string_to_bool_internal(const char* s, int len, ParseResult* result) { diff --git a/be/test/util/string_parser_test.cpp b/be/test/util/string_parser_test.cpp index a85e5b97ed..f9e9c37050 100644 --- a/be/test/util/string_parser_test.cpp +++ b/be/test/util/string_parser_test.cpp @@ -441,7 +441,6 @@ TEST(StringToFloat, Basic) { std::string double_max = boost::lexical_cast<std::string>(std::numeric_limits<double>::max()); test_float_value<double>(double_min, StringParser::PARSE_SUCCESS); test_float_value<double>(double_max, StringParser::PARSE_SUCCESS); - // Non-finite values test_all_float_variants("INFinity", StringParser::PARSE_SUCCESS); test_all_float_variants("infinity", StringParser::PARSE_SUCCESS); @@ -451,8 +450,8 @@ TEST(StringToFloat, Basic) { test_float_value_is_nan<double>("nan", StringParser::PARSE_SUCCESS); test_float_value_is_nan<float>("NaN", StringParser::PARSE_SUCCESS); test_float_value_is_nan<double>("NaN", StringParser::PARSE_SUCCESS); - test_float_value_is_nan<float>("nana", StringParser::PARSE_SUCCESS); - test_float_value_is_nan<double>("nana", StringParser::PARSE_SUCCESS); + test_float_value_is_nan<float>("nana", StringParser::PARSE_FAILURE); + test_float_value_is_nan<double>("nana", StringParser::PARSE_FAILURE); test_float_value_is_nan<float>("naN", StringParser::PARSE_SUCCESS); test_float_value_is_nan<double>("naN", StringParser::PARSE_SUCCESS); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org