This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new b4213638389 [improvement](cast) improve performance of casting string
to decimal and int (#60004) (#60159)
b4213638389 is described below
commit b4213638389d6ea35e74fddfea045105b60e960e
Author: TengJianPing <[email protected]>
AuthorDate: Wed Jan 28 12:28:47 2026 +0800
[improvement](cast) improve performance of casting string to decimal and
int (#60004) (#60159)
### What problem does this PR solve?
Issue Number: close #xxx
Pick #60004
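Problem Summary:
Speed up casting strings to decimal and integer types. As shown in the
diff below:
- StringParser::string_to_decimal no longer copies the digits into a
  temporary std::vector; it reads them directly from the input buffer and
  skips the '.' character while accumulating the integer and fraction parts.
  When the integer-part digit count does not fit in an int, negative input
  now reports PARSE_UNDERFLOW instead of PARSE_OVERFLOW.
- StringParser::string_to_int first tries to parse the input as-is, and only
  skips leading whitespace and retries when that first attempt fails.
  Trailing whitespace after the digits is trimmed with the new
  skip_trailing_whitespaces helper before the float-suffix check.
- CastToInt::from_string and try_parse_impl take strict mode as a template
  parameter, so the std::visit dispatch on params.is_strict is removed from
  CastToInt::from_string and callers choose the mode at the call site.
- Performance-test helpers for string-to-decimal casts are added; the test
  cases that use them are committed commented out.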
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/olap/delete_handler.cpp | 2 +-
be/src/util/jsonb_document_cast.h | 6 +-
be/src/util/string_parser.cpp | 73 ++++---
be/src/util/string_parser.hpp | 47 ++++-
.../data_types/serde/data_type_number_serde.cpp | 12 +-
.../functions/cast/cast_to_basic_number_common.h | 12 +-
.../vec/function/cast/cast_to_decimal128_perf.cpp | 220 +++++++++++++++++++++
7 files changed, 320 insertions(+), 52 deletions(-)
diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp
index 7c521aadfe1..3044b6e2d95 100644
--- a/be/src/olap/delete_handler.cpp
+++ b/be/src/olap/delete_handler.cpp
@@ -57,7 +57,7 @@ Status convert(const vectorized::DataTypePtr& data_type,
const std::string& str,
if constexpr (PType == TYPE_TINYINT || PType == TYPE_SMALLINT || PType ==
TYPE_INT ||
PType == TYPE_BIGINT || PType == TYPE_LARGEINT) {
vectorized::CastParameters parameters;
- if (!vectorized::CastToInt::from_string({str.data(), str.size()}, res,
parameters)) {
+ if (!vectorized::CastToInt::from_string<false>({str.data(),
str.size()}, res, parameters)) {
return Status::Error<ErrorCode::INVALID_ARGUMENT>(
"invalid {} string. str={}",
type_to_string(data_type->get_primitive_type()),
str);
diff --git a/be/src/util/jsonb_document_cast.h
b/be/src/util/jsonb_document_cast.h
index 8f44bf91f54..bd1acbcd335 100644
--- a/be/src/util/jsonb_document_cast.h
+++ b/be/src/util/jsonb_document_cast.h
@@ -174,7 +174,11 @@ struct JsonbCast {
case JsonbType::T_String: {
const auto* blob = jsonb_value->unpack<JsonbBinaryVal>();
StringRef str_ref {blob->getBlob(), blob->getBlobLen()};
- return CastToInt::from_string(str_ref, to, params);
+ return std::visit(
+ [&](auto is_strict_mode) {
+ return CastToInt::from_string<is_strict_mode>(str_ref,
to, params);
+ },
+ vectorized::make_bool_variant(params.is_strict));
}
default: {
return false;
diff --git a/be/src/util/string_parser.cpp b/be/src/util/string_parser.cpp
index dcdbda71edc..adc19dc8fb6 100644
--- a/be/src/util/string_parser.cpp
+++ b/be/src/util/string_parser.cpp
@@ -20,6 +20,7 @@
#include <limits>
#include "vec/core/extended_types.h"
+#include "vec/core/types.h"
namespace doris {
#include "common/compile_check_avoid_begin.h"
// Supported decimal number format:
@@ -78,17 +79,11 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType
StringParser::string_to_dec
--len;
}
int int_part_count = 0;
- std::vector<unsigned char> digits;
- if (len > 0) {
- digits.resize(len);
- }
- int total_digit_count = 0;
int i = 0;
for (; i != len; ++i) {
const char& c = s[i];
if (LIKELY('0' <= c && c <= '9')) {
found_value = true;
- digits[total_digit_count++] = c - '0';
if (!found_dot) {
++int_part_count;
}
@@ -109,6 +104,7 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType
StringParser::string_to_dec
}
// parse exponent if any
int64_t exponent = 0;
+ auto end_digit_index = i;
if (i != len) {
bool negative_exponent = false;
if (s[i] == 'e' || s[i] == 'E') {
@@ -160,51 +156,69 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType
StringParser::string_to_dec
// whose max value is std::numeric_limits<int32_t>::max() - 4,
// so int_part_count will be in range of int32_t,
// and int_part_count + exponent will be in range of int64_t
- int64_t tmp_actual_int_part_count = int_part_count + exponent;
- if (tmp_actual_int_part_count > std::numeric_limits<int>::max() ||
- tmp_actual_int_part_count < std::numeric_limits<int>::min()) {
- *result = StringParser::PARSE_OVERFLOW;
+ int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
+ if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
+ tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
+ *result = is_negative ? StringParser::PARSE_UNDERFLOW :
StringParser::PARSE_OVERFLOW;
return 0;
}
- int actual_int_part_count = tmp_actual_int_part_count;
+ int result_int_part_digit_count = tmp_result_int_part_digit_count;
int actual_frac_part_count = 0;
int digit_index = 0;
- if (actual_int_part_count >= 0) {
- int max_index = std::min(actual_int_part_count, total_digit_count);
+ if (result_int_part_digit_count >= 0) {
+ int max_index = std::min(found_dot ? (result_int_part_digit_count +
+ ((int_part_count > 0 && exponent
> 0) ? 1 : 0))
+ : result_int_part_digit_count,
+ end_digit_index);
+ max_index = (max_index == std::numeric_limits<int>::min() ?
end_digit_index : max_index);
// skip zero number
- for (; digit_index != max_index && digits[digit_index] == 0;
++digit_index) {
+ for (; digit_index != max_index && s[digit_index] == '0';
++digit_index) {
}
// test 0.00, .00, 0.{00...}e2147483647
// 0.00000e2147483647
- if (max_index - digit_index > type_precision - type_scale) {
+ if (digit_index != max_index &&
+ (result_int_part_digit_count - digit_index > type_precision -
type_scale)) {
*result = is_negative ? StringParser::PARSE_UNDERFLOW :
StringParser::PARSE_OVERFLOW;
return 0;
}
// get int part number
for (; digit_index != max_index; ++digit_index) {
- int_part_number = int_part_number * 10 + digits[digit_index];
+ if (UNLIKELY(s[digit_index] == '.')) {
+ continue;
+ }
+ int_part_number = int_part_number * 10 + (s[digit_index] - '0');
}
- if (digit_index != actual_int_part_count) {
- int_part_number *= get_scale_multiplier<T>(actual_int_part_count -
digit_index);
+ auto total_significant_digit_count = i - ((found_dot && int_part_count
> 0) ? 1 : 0);
+ if (result_int_part_digit_count > total_significant_digit_count) {
+ int_part_number *=
get_scale_multiplier<T>(result_int_part_digit_count -
+
total_significant_digit_count);
}
} else {
// leading zeros of fraction part
- actual_frac_part_count = -actual_int_part_count;
+ actual_frac_part_count = -result_int_part_digit_count;
}
// get fraction part number
- for (; digit_index != total_digit_count && actual_frac_part_count <
type_scale;
- ++digit_index, ++actual_frac_part_count) {
- frac_part_number = frac_part_number * 10 + digits[digit_index];
+ for (; digit_index != end_digit_index && actual_frac_part_count <
type_scale; ++digit_index) {
+ if (UNLIKELY(s[digit_index] == '.')) {
+ continue;
+ }
+ frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
+ ++actual_frac_part_count;
}
auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
// there are still extra fraction digits left, check rounding
- if (digit_index != total_digit_count) {
- // example: test 1.5 -> decimal(1, 0)
- if (digits[digit_index] >= 5) {
- ++frac_part_number;
- if (frac_part_number == type_scale_multiplier) {
- frac_part_number = 0;
- ++int_part_number;
+ if (digit_index != end_digit_index) {
+ if (UNLIKELY(s[digit_index] == '.')) {
+ ++digit_index;
+ }
+ if (digit_index != end_digit_index) {
+ // example: test 1.5 -> decimal(1, 0)
+ if (s[digit_index] >= '5') {
+ ++frac_part_number;
+ if (frac_part_number == type_scale_multiplier) {
+ frac_part_number = 0;
+ ++int_part_number;
+ }
}
}
} else {
@@ -221,6 +235,7 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType
StringParser::string_to_dec
*result = StringParser::PARSE_SUCCESS;
return is_negative ? T(-value) : T(value);
}
+
template vectorized::Int32
StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL32>(
const char* __restrict s, size_t len, int type_precision, int
type_scale,
ParseResult* result);
diff --git a/be/src/util/string_parser.hpp b/be/src/util/string_parser.hpp
index 88e3ff031c0..65fdd201a77 100644
--- a/be/src/util/string_parser.hpp
+++ b/be/src/util/string_parser.hpp
@@ -97,6 +97,29 @@ inline const char* skip_ascii_whitespaces(const char* s, T&
len) {
return s;
}
+template <typename T>
+inline const char* skip_leading_whitespace(const char* __restrict s, T& len) {
+ while (len > 0 && is_whitespace_ascii(*s)) {
+ ++s;
+ --len;
+ }
+
+ return s;
+}
+
+// skip trailing ascii whitespaces,
+// return the pointer to the first char,
+// and update the len to the new length, which does not include
+// trailing whitespaces
+template <typename T>
+inline const char* skip_trailing_whitespaces(const char* s, T& len) {
+ while (len > 0 && is_whitespace_ascii(s[len - 1])) {
+ --len;
+ }
+
+ return s;
+}
+
template <bool (*Pred)(char)>
bool range_suite(const char* s, const char* end) {
return std::ranges::all_of(s, end, Pred);
@@ -308,7 +331,11 @@ public:
// Assumes s represents a decimal number.
template <typename T, bool enable_strict_mode = false>
static inline T string_to_int(const char* __restrict s, size_t len,
ParseResult* result) {
- s = skip_ascii_whitespaces(s, len);
+ T ans = string_to_int_internal<T, enable_strict_mode>(s, len, result);
+ if (LIKELY(*result == PARSE_SUCCESS)) {
+ return ans;
+ }
+ s = skip_leading_whitespace(s, len);
return string_to_int_internal<T, enable_strict_mode>(s, len, result);
}
@@ -502,8 +529,13 @@ T StringParser::string_to_int_internal(const char*
__restrict s, int len, ParseR
return 0;
}
} else {
- if ((UNLIKELY(i == first || (!is_all_whitespace(s + i, len -
i) &&
- !is_float_suffix(s + i, len -
i))))) {
+ // Save original position where non-digit was found
+ int remaining_len = len - i;
+ const char* remaining_s = s + i;
+ // Skip trailing whitespaces from the remaining portion
+ remaining_s = skip_trailing_whitespaces(remaining_s,
remaining_len);
+ if ((UNLIKELY(i == first || (remaining_len != 0 &&
+ !is_float_suffix(remaining_s,
remaining_len))))) {
// Reject the string because either the first char was not
a digit,
// or the remaining chars are not all whitespace
*result = PARSE_FAILURE;
@@ -652,8 +684,13 @@ T StringParser::string_to_int_no_overflow(const char*
__restrict s, int len, Par
return 0;
}
} else {
- if ((UNLIKELY(!is_all_whitespace(s + i, len - i) &&
- !is_float_suffix(s + i, len - i)))) {
+ // Save original position where non-digit was found
+ int remaining_len = len - i;
+ const char* remaining_s = s + i;
+ // Skip trailing whitespaces from the remaining portion
+ remaining_s = skip_trailing_whitespaces(remaining_s,
remaining_len);
+ if ((UNLIKELY(remaining_len != 0 &&
+ !is_float_suffix(remaining_s, remaining_len)))) {
*result = PARSE_FAILURE;
return 0;
}
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp
b/be/src/vec/data_types/serde/data_type_number_serde.cpp
index 454321ff433..1ad1b7fb523 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp
@@ -709,7 +709,7 @@ void DataTypeNumberSerDe<T>::write_one_cell_to_jsonb(const
IColumn& column,
}
}
-template <PrimitiveType PT>
+template <PrimitiveType PT, bool is_strict_mode>
bool try_parse_impl(typename PrimitiveTypeTraits<PT>::CppType& x, const
StringRef& str_ref,
CastParameters& params) {
if constexpr (is_float_or_double(PT)) {
@@ -717,7 +717,7 @@ bool try_parse_impl(typename
PrimitiveTypeTraits<PT>::CppType& x, const StringRe
} else if constexpr (PT == TYPE_BOOLEAN) {
return CastToBool::from_string(str_ref, x, params);
} else if constexpr (is_int(PT)) {
- return CastToInt::from_string(str_ref, x, params);
+ return CastToInt::from_string<is_strict_mode>(str_ref, x, params);
} else {
throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
"try_parse_impl not implemented for type: {}",
type_to_string(PT));
@@ -731,7 +731,7 @@ Status DataTypeNumberSerDe<T>::from_string(StringRef& str,
IColumn& column,
typename PrimitiveTypeTraits<T>::CppType val;
CastParameters params;
params.is_strict = false;
- if (!try_parse_impl<T>(val, str, params)) {
+ if (!try_parse_impl<T, false>(val, str, params)) {
return Status::InvalidArgument("parse number fail, string: '{}'",
str.to_string());
}
column_data.insert_value(val);
@@ -745,7 +745,7 @@ Status
DataTypeNumberSerDe<T>::from_string_strict_mode(StringRef& str, IColumn&
typename PrimitiveTypeTraits<T>::CppType val;
CastParameters params;
params.is_strict = true;
- if (!try_parse_impl<T>(val, str, params)) {
+ if (!try_parse_impl<T, true>(val, str, params)) {
return Status::InvalidArgument("parse number fail, string: '{}'",
str.to_string());
}
column_data.insert_value(val);
@@ -773,7 +773,7 @@ Status DataTypeNumberSerDe<T>::from_string_batch(const
ColumnString& str, Column
size_t string_size = next_offset - current_offset;
StringRef str_ref(&(*chars)[current_offset], string_size);
- null_map[i] = !try_parse_impl<T>(vec_to[i], str_ref, params);
+ null_map[i] = !try_parse_impl<T, false>(vec_to[i], str_ref, params);
current_offset = next_offset;
}
return Status::OK();
@@ -802,7 +802,7 @@ Status
DataTypeNumberSerDe<T>::from_string_strict_mode_batch(
size_t string_size = next_offset - current_offset;
StringRef str_ref(&(*chars)[current_offset], string_size);
- if (!try_parse_impl<T>(vec_to[i], str_ref, params)) {
+ if (!try_parse_impl<T, true>(vec_to[i], str_ref, params)) {
return Status::InvalidArgument(
"parse number fail, string: '{}'",
std::string((char*)&(*chars)[current_offset],
string_size));
diff --git a/be/src/vec/functions/cast/cast_to_basic_number_common.h
b/be/src/vec/functions/cast/cast_to_basic_number_common.h
index bc91cf4d589..90f1c087b6e 100644
--- a/be/src/vec/functions/cast/cast_to_basic_number_common.h
+++ b/be/src/vec/functions/cast/cast_to_basic_number_common.h
@@ -132,18 +132,10 @@ constexpr bool IsCppTypeDateTime =
std::is_same_v<CppT, PrimitiveTypeTraits<TYPE_DATETIME>::CppType> ||
std::is_same_v<CppT, PrimitiveTypeTraits<TYPE_DATETIMEV2>::CppType>;
struct CastToInt {
- template <typename ToCppT>
+ template <bool is_strict_mode, typename ToCppT>
requires(IsCppTypeInt<ToCppT>)
static inline bool from_string(const StringRef& from, ToCppT& to,
CastParameters& params) {
- return std::visit(
- [&](auto is_strict_mode) {
- if constexpr (is_strict_mode) {
- return try_read_int_text<ToCppT, true>(to, from);
- } else {
- return try_read_int_text<ToCppT, false>(to, from);
- }
- },
- vectorized::make_bool_variant(params.is_strict));
+ return try_read_int_text<ToCppT, is_strict_mode>(to, from);
}
template <typename FromCppT, typename ToCppT>
diff --git a/be/test/vec/function/cast/cast_to_decimal128_perf.cpp
b/be/test/vec/function/cast/cast_to_decimal128_perf.cpp
index e56d845c38b..f381b44365a 100644
--- a/be/test/vec/function/cast/cast_to_decimal128_perf.cpp
+++ b/be/test/vec/function/cast/cast_to_decimal128_perf.cpp
@@ -27,6 +27,7 @@
#include "vec/columns/column_vector.h"
#include "vec/columns/common_column_test.h"
#include "vec/core/field.h"
+#include "vec/json/json_parser.h"
namespace doris::vectorized {
struct FunctionCastToDecimalPerfTest : public FunctionCastTest {
@@ -76,6 +77,108 @@ struct FunctionCastToDecimalPerfTest : public
FunctionCastTest {
type_to_string(ToPT),
PrettyPrinter::print(duration_ns,
TUnit::TIME_NS));
}
+ template <PrimitiveType ToPT>
+ int64_t perf_test_string_to_decimal(const MutableColumnPtr& column_orig,
size_t orig_line_count,
+ int to_precision, int to_scale, bool
nullable) {
+ // Create string column and load data from file
+ auto dt_from = std::make_shared<DataTypeString>();
+ // Replicate data 10 times for better performance measurement
+ auto column_from = dt_from->create_column();
+ for (int i = 0; i < 10; ++i) {
+ column_from->insert_range_from(*column_orig, 0,
column_orig->size());
+ }
+ EXPECT_EQ(column_from->size(), orig_line_count * 10);
+ std::cout << "column_from size: " << column_from->size() << std::endl;
+
+ // Create target decimal type
+ DataTypePtr dt_to = DataTypeFactory::instance().create_data_type(ToPT,
nullable,
+
to_precision, to_scale);
+
+ // Setup cast context
+ bool is_strict_mode = false;
+ auto ctx = create_context(is_strict_mode);
+ auto fn = get_cast_wrapper(ctx.get(), dt_from, dt_to);
+
+ Block block = {
+ {std::move(column_from), dt_from, "from"},
+ {nullptr, dt_to, "to"},
+ };
+
+ // Measure performance
+ int64_t duration_ns = 0;
+ {
+ SCOPED_RAW_TIMER(&duration_ns);
+ EXPECT_TRUE(fn(ctx.get(), block, {0}, 1, block.rows(), nullptr));
+ }
+
+ auto result = block.get_by_position(1).column;
+ EXPECT_EQ(result->size(), orig_line_count * 10);
+
+ // Display results
+ // std::cout << block.dump_data(0, 10) << "\n";
+ std::cout << fmt::format("cast STRING to {} (precision={}, scale={})
time: {}\n",
+ type_to_string(ToPT), to_precision, to_scale,
+ PrettyPrinter::print(duration_ns,
TUnit::TIME_NS));
+
+ // Calculate throughput
+ double rows_per_second = (orig_line_count * 10.0) / (duration_ns /
1e9);
+ std::cout << fmt::format("Throughput: {:.2f} million rows/second\n",
rows_per_second / 1e6);
+ return duration_ns;
+ }
+
+ // Variant with strict mode support
+ template <PrimitiveType ToPT>
+ void perf_test_string_to_decimal_strict(const std::string& test_data_file,
+ size_t orig_line_count, int
to_precision, int to_scale,
+ bool nullable, bool strict_mode) {
+ auto dt_from = std::make_shared<DataTypeString>();
+ auto column_orig = dt_from->create_column();
+
+ {
+ MutableColumns columns;
+ columns.push_back(column_orig->get_ptr());
+ DataTypeSerDeSPtrs serde = {dt_from->get_serde()};
+ load_columns_data_from_file(columns, serde, ';', {0},
test_data_file);
+ EXPECT_EQ(column_orig->size(), orig_line_count);
+ }
+
+ auto column_from = dt_from->create_column();
+ for (int i = 0; i < 10; ++i) {
+ column_from->insert_range_from(*column_orig, 0,
column_orig->size());
+ }
+ EXPECT_EQ(column_from->size(), orig_line_count * 10);
+ std::cout << "column_from size: " << column_from->size() << std::endl;
+
+ DataTypePtr dt_to = DataTypeFactory::instance().create_data_type(ToPT,
nullable,
+
to_precision, to_scale);
+
+ auto ctx = create_context(strict_mode);
+ auto fn = get_cast_wrapper(ctx.get(), dt_from, dt_to);
+ ASSERT_TRUE(fn != nullptr);
+
+ Block block = {
+ {std::move(column_from), dt_from, "from"},
+ {nullptr, dt_to, "to"},
+ };
+
+ int64_t duration_ns = 0;
+ {
+ SCOPED_RAW_TIMER(&duration_ns);
+ EXPECT_TRUE(fn(ctx.get(), block, {0}, 1, block.rows(), nullptr));
+ }
+
+ auto result = block.get_by_position(1).column;
+ EXPECT_EQ(result->size(), orig_line_count * 10);
+
+ std::cout << block.dump_data(0, 10) << "\n";
+ std::cout << fmt::format(
+ "cast STRING to {} (precision={}, scale={}, strict_mode={})
time: {}\n",
+ type_to_string(ToPT), to_precision, to_scale, strict_mode,
+ PrettyPrinter::print(duration_ns, TUnit::TIME_NS));
+
+ double rows_per_second = (orig_line_count * 10.0) / (duration_ns /
1e9);
+ std::cout << fmt::format("Throughput: {:.2f} million rows/second\n",
rows_per_second / 1e6);
+ }
};
// Optimized version with reserve for better performance
@@ -100,6 +203,123 @@ void read_file_to_vector_optimized(const std::string&
filename, std::vector<std:
lines.push_back(std::move(line)); // Move instead of copy
}
}
+// Test cases
+/*
+TEST_F(FunctionCastToDecimalPerfTest, test_string_to_decimal128v3_perf) {
+ std::string test_data_file =
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_38_19.txt";
+ size_t orig_line_count = 10000000; // 10 million lines
+ // Create string column and load data from file
+ auto dt_from = std::make_shared<DataTypeString>();
+ auto column_orig = dt_from->create_column();
+
+ {
+ MutableColumns columns;
+ columns.push_back(column_orig->get_ptr());
+ DataTypeSerDeSPtrs serde = {dt_from->get_serde()};
+ load_columns_data_from_file(columns, serde, ';', {0}, test_data_file);
+ EXPECT_EQ(column_orig->size(), orig_line_count);
+ }
+
+ int test_rounds = 10;
+ int64_t total_duration_ns = 0;
+ for (int i = 0; i < test_rounds; ++i) {
+ std::cout << "\n--- Test Round " << (i + 1) << " ---\n";
+ total_duration_ns +=
perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMAL128I>(
+ column_orig, orig_line_count, 38, 19, false);
+ }
+ auto avg_duration_ns = total_duration_ns / test_rounds;
+ std::cout << fmt::format("\nAverage time over {} rounds: {}\n",
test_rounds,
+ PrettyPrinter::print(avg_duration_ns,
TUnit::TIME_NS));
+}
+
+TEST_F(FunctionCastToDecimalPerfTest,
test_string_to_decimal128v3_nullable_perf) {
+ std::string test_data_file =
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_38_19.txt";
+ size_t orig_line_count = 10000000; // 10 million lines
+ // Create string column and load data from file
+ auto dt_from = std::make_shared<DataTypeString>();
+ auto column_orig = dt_from->create_column();
+
+ {
+ MutableColumns columns;
+ columns.push_back(column_orig->get_ptr());
+ DataTypeSerDeSPtrs serde = {dt_from->get_serde()};
+ load_columns_data_from_file(columns, serde, ';', {0}, test_data_file);
+ EXPECT_EQ(column_orig->size(), orig_line_count);
+ }
+
+ int test_rounds = 10;
+ int64_t total_duration_ns = 0;
+ for (int i = 0; i < test_rounds; ++i) {
+ std::cout << "\n--- Test Round " << (i + 1) << " ---\n";
+ total_duration_ns +=
perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMAL128I>(
+ column_orig, orig_line_count, 38, 19, true);
+ }
+ auto avg_duration_ns = total_duration_ns / test_rounds;
+ std::cout << fmt::format("\nAverage time over {} rounds: {}\n",
test_rounds,
+ PrettyPrinter::print(avg_duration_ns,
TUnit::TIME_NS));
+}
+
+TEST_F(FunctionCastToDecimalPerfTest, test_string_to_decimal64v3_perf) {
+ std::string test_data_file =
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_18_9.txt";
+ size_t orig_line_count = 10000000;
+ perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMAL64>(test_data_file,
orig_line_count, 18,
+ 9, false);
+}
+
+TEST_F(FunctionCastToDecimalPerfTest, test_string_to_decimal32v3_perf) {
+ std::string test_data_file =
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_9_4.txt";
+ size_t orig_line_count = 10000000;
+ perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMAL32>(test_data_file,
orig_line_count, 9,
+ 4, false);
+}
+
+TEST_F(FunctionCastToDecimalPerfTest, test_string_to_decimalv2_perf) {
+ std::string test_data_file =
"/mnt/disk2/tengjianping/cast_perf/decimalv2_test_data_27_9.txt";
+ size_t orig_line_count = 10000000;
+ perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMALV2>(test_data_file,
orig_line_count, 27,
+ 9, false);
+}
+
+TEST_F(FunctionCastToDecimalPerfTest,
test_string_to_decimal128v3_strict_mode_perf) {
+ std::string test_data_file =
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_38_19.txt";
+ size_t orig_line_count = 10000000;
+
+ std::cout << "\n=== Testing with strict_mode=false ===\n";
+ perf_test_string_to_decimal_strict<PrimitiveType::TYPE_DECIMAL128I>(
+ test_data_file, orig_line_count, 38, 19, false, false);
+
+ std::cout << "\n=== Testing with strict_mode=true ===\n";
+ perf_test_string_to_decimal_strict<PrimitiveType::TYPE_DECIMAL128I>(
+ test_data_file, orig_line_count, 38, 19, false, true);
+}
+
+// Comparison test: String vs Direct decimal parsing
+TEST_F(FunctionCastToDecimalPerfTest, test_string_parse_comparison) {
+ std::vector<std::string> file_contents;
+
read_file_to_vector_optimized("/mnt/disk2/tengjianping/cast_perf/decimal_test_data_38_19.txt",
+ file_contents);
+
+ size_t line_count = file_contents.size();
+ std::cout << "Read " << line_count << " lines" << std::endl;
+
+ // Test 1: Direct StringParser
+ int64_t duration_ns_parser = 0;
+ StringParser::ParseResult result;
+ {
+ SCOPED_RAW_TIMER(&duration_ns_parser);
+ for (size_t i = 0; i != line_count; ++i) {
+ auto decimal_value =
StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL128I>(
+ file_contents[i].c_str(), file_contents[i].size(), 38, 19,
&result);
+ (void)decimal_value;
+ }
+ }
+ std::cout << "Direct StringParser time: "
+ << PrettyPrinter::print(duration_ns_parser, TUnit::TIME_NS) <<
"\n";
+
+ // Test 2: Cast function (already tested in other tests)
+ std::cout << "\nFor cast function performance, see
test_string_to_decimal128v3_perf\n";
+}
+ */
/*
TEST_F(FunctionCastToDecimalPerfTest, test_to_decimal128v3_from_string_perf) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]