This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new b4213638389 [improvement](cast) improve performance of casting string 
to decimal and int (#60004) (#60159)
b4213638389 is described below

commit b4213638389d6ea35e74fddfea045105b60e960e
Author: TengJianPing <[email protected]>
AuthorDate: Wed Jan 28 12:28:47 2026 +0800

    [improvement](cast) improve performance of casting string to decimal and 
int (#60004) (#60159)
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Pick #60004
    
    Problem Summary:
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
            - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
        - [ ] Yes. <!-- Add document PR link here, e.g.
          https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/olap/delete_handler.cpp                     |   2 +-
 be/src/util/jsonb_document_cast.h                  |   6 +-
 be/src/util/string_parser.cpp                      |  73 ++++---
 be/src/util/string_parser.hpp                      |  47 ++++-
 .../data_types/serde/data_type_number_serde.cpp    |  12 +-
 .../functions/cast/cast_to_basic_number_common.h   |  12 +-
 .../vec/function/cast/cast_to_decimal128_perf.cpp  | 220 +++++++++++++++++++++
 7 files changed, 320 insertions(+), 52 deletions(-)

diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp
index 7c521aadfe1..3044b6e2d95 100644
--- a/be/src/olap/delete_handler.cpp
+++ b/be/src/olap/delete_handler.cpp
@@ -57,7 +57,7 @@ Status convert(const vectorized::DataTypePtr& data_type, 
const std::string& str,
     if constexpr (PType == TYPE_TINYINT || PType == TYPE_SMALLINT || PType == 
TYPE_INT ||
                   PType == TYPE_BIGINT || PType == TYPE_LARGEINT) {
         vectorized::CastParameters parameters;
-        if (!vectorized::CastToInt::from_string({str.data(), str.size()}, res, 
parameters)) {
+        if (!vectorized::CastToInt::from_string<false>({str.data(), 
str.size()}, res, parameters)) {
             return Status::Error<ErrorCode::INVALID_ARGUMENT>(
                     "invalid {} string. str={}", 
type_to_string(data_type->get_primitive_type()),
                     str);
diff --git a/be/src/util/jsonb_document_cast.h 
b/be/src/util/jsonb_document_cast.h
index 8f44bf91f54..bd1acbcd335 100644
--- a/be/src/util/jsonb_document_cast.h
+++ b/be/src/util/jsonb_document_cast.h
@@ -174,7 +174,11 @@ struct JsonbCast {
         case JsonbType::T_String: {
             const auto* blob = jsonb_value->unpack<JsonbBinaryVal>();
             StringRef str_ref {blob->getBlob(), blob->getBlobLen()};
-            return CastToInt::from_string(str_ref, to, params);
+            return std::visit(
+                    [&](auto is_strict_mode) {
+                        return CastToInt::from_string<is_strict_mode>(str_ref, 
to, params);
+                    },
+                    vectorized::make_bool_variant(params.is_strict));
         }
         default: {
             return false;
diff --git a/be/src/util/string_parser.cpp b/be/src/util/string_parser.cpp
index dcdbda71edc..adc19dc8fb6 100644
--- a/be/src/util/string_parser.cpp
+++ b/be/src/util/string_parser.cpp
@@ -20,6 +20,7 @@
 #include <limits>
 
 #include "vec/core/extended_types.h"
+#include "vec/core/types.h"
 namespace doris {
 #include "common/compile_check_avoid_begin.h"
 // Supported decimal number format:
@@ -78,17 +79,11 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
         --len;
     }
     int int_part_count = 0;
-    std::vector<unsigned char> digits;
-    if (len > 0) {
-        digits.resize(len);
-    }
-    int total_digit_count = 0;
     int i = 0;
     for (; i != len; ++i) {
         const char& c = s[i];
         if (LIKELY('0' <= c && c <= '9')) {
             found_value = true;
-            digits[total_digit_count++] = c - '0';
             if (!found_dot) {
                 ++int_part_count;
             }
@@ -109,6 +104,7 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
     }
     // parse exponent if any
     int64_t exponent = 0;
+    auto end_digit_index = i;
     if (i != len) {
         bool negative_exponent = false;
         if (s[i] == 'e' || s[i] == 'E') {
@@ -160,51 +156,69 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
     // whose max value is std::numeric_limits<int32_t>::max() - 4,
     // so int_part_count will be in range of int32_t,
     // and int_part_count + exponent will be in range of int64_t
-    int64_t tmp_actual_int_part_count = int_part_count + exponent;
-    if (tmp_actual_int_part_count > std::numeric_limits<int>::max() ||
-        tmp_actual_int_part_count < std::numeric_limits<int>::min()) {
-        *result = StringParser::PARSE_OVERFLOW;
+    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
+    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
+        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
+        *result = is_negative ? StringParser::PARSE_UNDERFLOW : 
StringParser::PARSE_OVERFLOW;
         return 0;
     }
-    int actual_int_part_count = tmp_actual_int_part_count;
+    int result_int_part_digit_count = tmp_result_int_part_digit_count;
     int actual_frac_part_count = 0;
     int digit_index = 0;
-    if (actual_int_part_count >= 0) {
-        int max_index = std::min(actual_int_part_count, total_digit_count);
+    if (result_int_part_digit_count >= 0) {
+        int max_index = std::min(found_dot ? (result_int_part_digit_count +
+                                              ((int_part_count > 0 && exponent 
> 0) ? 1 : 0))
+                                           : result_int_part_digit_count,
+                                 end_digit_index);
+        max_index = (max_index == std::numeric_limits<int>::min() ? 
end_digit_index : max_index);
         // skip zero number
-        for (; digit_index != max_index && digits[digit_index] == 0; 
++digit_index) {
+        for (; digit_index != max_index && s[digit_index] == '0'; 
++digit_index) {
         }
         // test 0.00, .00, 0.{00...}e2147483647
         // 0.00000e2147483647
-        if (max_index - digit_index > type_precision - type_scale) {
+        if (digit_index != max_index &&
+            (result_int_part_digit_count - digit_index > type_precision - 
type_scale)) {
             *result = is_negative ? StringParser::PARSE_UNDERFLOW : 
StringParser::PARSE_OVERFLOW;
             return 0;
         }
         // get int part number
         for (; digit_index != max_index; ++digit_index) {
-            int_part_number = int_part_number * 10 + digits[digit_index];
+            if (UNLIKELY(s[digit_index] == '.')) {
+                continue;
+            }
+            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
         }
-        if (digit_index != actual_int_part_count) {
-            int_part_number *= get_scale_multiplier<T>(actual_int_part_count - 
digit_index);
+        auto total_significant_digit_count = i - ((found_dot && int_part_count 
> 0) ? 1 : 0);
+        if (result_int_part_digit_count > total_significant_digit_count) {
+            int_part_number *= 
get_scale_multiplier<T>(result_int_part_digit_count -
+                                                       
total_significant_digit_count);
         }
     } else {
         // leading zeros of fraction part
-        actual_frac_part_count = -actual_int_part_count;
+        actual_frac_part_count = -result_int_part_digit_count;
     }
     // get fraction part number
-    for (; digit_index != total_digit_count && actual_frac_part_count < 
type_scale;
-         ++digit_index, ++actual_frac_part_count) {
-        frac_part_number = frac_part_number * 10 + digits[digit_index];
+    for (; digit_index != end_digit_index && actual_frac_part_count < 
type_scale; ++digit_index) {
+        if (UNLIKELY(s[digit_index] == '.')) {
+            continue;
+        }
+        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
+        ++actual_frac_part_count;
     }
     auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
     // there are still extra fraction digits left, check rounding
-    if (digit_index != total_digit_count) {
-        // example: test 1.5 -> decimal(1, 0)
-        if (digits[digit_index] >= 5) {
-            ++frac_part_number;
-            if (frac_part_number == type_scale_multiplier) {
-                frac_part_number = 0;
-                ++int_part_number;
+    if (digit_index != end_digit_index) {
+        if (UNLIKELY(s[digit_index] == '.')) {
+            ++digit_index;
+        }
+        if (digit_index != end_digit_index) {
+            // example: test 1.5 -> decimal(1, 0)
+            if (s[digit_index] >= '5') {
+                ++frac_part_number;
+                if (frac_part_number == type_scale_multiplier) {
+                    frac_part_number = 0;
+                    ++int_part_number;
+                }
             }
         }
     } else {
@@ -221,6 +235,7 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
     *result = StringParser::PARSE_SUCCESS;
     return is_negative ? T(-value) : T(value);
 }
+
 template vectorized::Int32 
StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL32>(
         const char* __restrict s, size_t len, int type_precision, int 
type_scale,
         ParseResult* result);
diff --git a/be/src/util/string_parser.hpp b/be/src/util/string_parser.hpp
index 88e3ff031c0..65fdd201a77 100644
--- a/be/src/util/string_parser.hpp
+++ b/be/src/util/string_parser.hpp
@@ -97,6 +97,29 @@ inline const char* skip_ascii_whitespaces(const char* s, T& 
len) {
     return s;
 }
 
+template <typename T>
+inline const char* skip_leading_whitespace(const char* __restrict s, T& len) {
+    while (len > 0 && is_whitespace_ascii(*s)) {
+        ++s;
+        --len;
+    }
+
+    return s;
+}
+
+// skip trailing ascii whitespaces,
+// return the pointer to the first char,
+// and update the len to the new length, which does not include
+// trailing whitespaces
+template <typename T>
+inline const char* skip_trailing_whitespaces(const char* s, T& len) {
+    while (len > 0 && is_whitespace_ascii(s[len - 1])) {
+        --len;
+    }
+
+    return s;
+}
+
 template <bool (*Pred)(char)>
 bool range_suite(const char* s, const char* end) {
     return std::ranges::all_of(s, end, Pred);
@@ -308,7 +331,11 @@ public:
     // Assumes s represents a decimal number.
     template <typename T, bool enable_strict_mode = false>
     static inline T string_to_int(const char* __restrict s, size_t len, 
ParseResult* result) {
-        s = skip_ascii_whitespaces(s, len);
+        T ans = string_to_int_internal<T, enable_strict_mode>(s, len, result);
+        if (LIKELY(*result == PARSE_SUCCESS)) {
+            return ans;
+        }
+        s = skip_leading_whitespace(s, len);
         return string_to_int_internal<T, enable_strict_mode>(s, len, result);
     }
 
@@ -502,8 +529,13 @@ T StringParser::string_to_int_internal(const char* 
__restrict s, int len, ParseR
                     return 0;
                 }
             } else {
-                if ((UNLIKELY(i == first || (!is_all_whitespace(s + i, len - 
i) &&
-                                             !is_float_suffix(s + i, len - 
i))))) {
+                // Save original position where non-digit was found
+                int remaining_len = len - i;
+                const char* remaining_s = s + i;
+                // Skip trailing whitespaces from the remaining portion
+                remaining_s = skip_trailing_whitespaces(remaining_s, 
remaining_len);
+                if ((UNLIKELY(i == first || (remaining_len != 0 &&
+                                             !is_float_suffix(remaining_s, 
remaining_len))))) {
                     // Reject the string because either the first char was not 
a digit,
                     // or the remaining chars are not all whitespace
                     *result = PARSE_FAILURE;
@@ -652,8 +684,13 @@ T StringParser::string_to_int_no_overflow(const char* 
__restrict s, int len, Par
                     return 0;
                 }
             } else {
-                if ((UNLIKELY(!is_all_whitespace(s + i, len - i) &&
-                              !is_float_suffix(s + i, len - i)))) {
+                // Save original position where non-digit was found
+                int remaining_len = len - i;
+                const char* remaining_s = s + i;
+                // Skip trailing whitespaces from the remaining portion
+                remaining_s = skip_trailing_whitespaces(remaining_s, 
remaining_len);
+                if ((UNLIKELY(remaining_len != 0 &&
+                              !is_float_suffix(remaining_s, remaining_len)))) {
                     *result = PARSE_FAILURE;
                     return 0;
                 }
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp 
b/be/src/vec/data_types/serde/data_type_number_serde.cpp
index 454321ff433..1ad1b7fb523 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp
@@ -709,7 +709,7 @@ void DataTypeNumberSerDe<T>::write_one_cell_to_jsonb(const 
IColumn& column,
     }
 }
 
-template <PrimitiveType PT>
+template <PrimitiveType PT, bool is_strict_mode>
 bool try_parse_impl(typename PrimitiveTypeTraits<PT>::CppType& x, const 
StringRef& str_ref,
                     CastParameters& params) {
     if constexpr (is_float_or_double(PT)) {
@@ -717,7 +717,7 @@ bool try_parse_impl(typename 
PrimitiveTypeTraits<PT>::CppType& x, const StringRe
     } else if constexpr (PT == TYPE_BOOLEAN) {
         return CastToBool::from_string(str_ref, x, params);
     } else if constexpr (is_int(PT)) {
-        return CastToInt::from_string(str_ref, x, params);
+        return CastToInt::from_string<is_strict_mode>(str_ref, x, params);
     } else {
         throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
                                "try_parse_impl not implemented for type: {}", 
type_to_string(PT));
@@ -731,7 +731,7 @@ Status DataTypeNumberSerDe<T>::from_string(StringRef& str, 
IColumn& column,
     typename PrimitiveTypeTraits<T>::CppType val;
     CastParameters params;
     params.is_strict = false;
-    if (!try_parse_impl<T>(val, str, params)) {
+    if (!try_parse_impl<T, false>(val, str, params)) {
         return Status::InvalidArgument("parse number fail, string: '{}'", 
str.to_string());
     }
     column_data.insert_value(val);
@@ -745,7 +745,7 @@ Status 
DataTypeNumberSerDe<T>::from_string_strict_mode(StringRef& str, IColumn&
     typename PrimitiveTypeTraits<T>::CppType val;
     CastParameters params;
     params.is_strict = true;
-    if (!try_parse_impl<T>(val, str, params)) {
+    if (!try_parse_impl<T, true>(val, str, params)) {
         return Status::InvalidArgument("parse number fail, string: '{}'", 
str.to_string());
     }
     column_data.insert_value(val);
@@ -773,7 +773,7 @@ Status DataTypeNumberSerDe<T>::from_string_batch(const 
ColumnString& str, Column
         size_t string_size = next_offset - current_offset;
 
         StringRef str_ref(&(*chars)[current_offset], string_size);
-        null_map[i] = !try_parse_impl<T>(vec_to[i], str_ref, params);
+        null_map[i] = !try_parse_impl<T, false>(vec_to[i], str_ref, params);
         current_offset = next_offset;
     }
     return Status::OK();
@@ -802,7 +802,7 @@ Status 
DataTypeNumberSerDe<T>::from_string_strict_mode_batch(
         size_t string_size = next_offset - current_offset;
 
         StringRef str_ref(&(*chars)[current_offset], string_size);
-        if (!try_parse_impl<T>(vec_to[i], str_ref, params)) {
+        if (!try_parse_impl<T, true>(vec_to[i], str_ref, params)) {
             return Status::InvalidArgument(
                     "parse number fail, string: '{}'",
                     std::string((char*)&(*chars)[current_offset], 
string_size));
diff --git a/be/src/vec/functions/cast/cast_to_basic_number_common.h 
b/be/src/vec/functions/cast/cast_to_basic_number_common.h
index bc91cf4d589..90f1c087b6e 100644
--- a/be/src/vec/functions/cast/cast_to_basic_number_common.h
+++ b/be/src/vec/functions/cast/cast_to_basic_number_common.h
@@ -132,18 +132,10 @@ constexpr bool IsCppTypeDateTime =
         std::is_same_v<CppT, PrimitiveTypeTraits<TYPE_DATETIME>::CppType> ||
         std::is_same_v<CppT, PrimitiveTypeTraits<TYPE_DATETIMEV2>::CppType>;
 struct CastToInt {
-    template <typename ToCppT>
+    template <bool is_strict_mode, typename ToCppT>
         requires(IsCppTypeInt<ToCppT>)
     static inline bool from_string(const StringRef& from, ToCppT& to, 
CastParameters& params) {
-        return std::visit(
-                [&](auto is_strict_mode) {
-                    if constexpr (is_strict_mode) {
-                        return try_read_int_text<ToCppT, true>(to, from);
-                    } else {
-                        return try_read_int_text<ToCppT, false>(to, from);
-                    }
-                },
-                vectorized::make_bool_variant(params.is_strict));
+        return try_read_int_text<ToCppT, is_strict_mode>(to, from);
     }
 
     template <typename FromCppT, typename ToCppT>
diff --git a/be/test/vec/function/cast/cast_to_decimal128_perf.cpp 
b/be/test/vec/function/cast/cast_to_decimal128_perf.cpp
index e56d845c38b..f381b44365a 100644
--- a/be/test/vec/function/cast/cast_to_decimal128_perf.cpp
+++ b/be/test/vec/function/cast/cast_to_decimal128_perf.cpp
@@ -27,6 +27,7 @@
 #include "vec/columns/column_vector.h"
 #include "vec/columns/common_column_test.h"
 #include "vec/core/field.h"
+#include "vec/json/json_parser.h"
 
 namespace doris::vectorized {
 struct FunctionCastToDecimalPerfTest : public FunctionCastTest {
@@ -76,6 +77,108 @@ struct FunctionCastToDecimalPerfTest : public 
FunctionCastTest {
                                  type_to_string(ToPT),
                                  PrettyPrinter::print(duration_ns, 
TUnit::TIME_NS));
     }
+    template <PrimitiveType ToPT>
+    int64_t perf_test_string_to_decimal(const MutableColumnPtr& column_orig, 
size_t orig_line_count,
+                                        int to_precision, int to_scale, bool 
nullable) {
+        // Create string column and load data from file
+        auto dt_from = std::make_shared<DataTypeString>();
+        // Replicate data 10 times for better performance measurement
+        auto column_from = dt_from->create_column();
+        for (int i = 0; i < 10; ++i) {
+            column_from->insert_range_from(*column_orig, 0, 
column_orig->size());
+        }
+        EXPECT_EQ(column_from->size(), orig_line_count * 10);
+        std::cout << "column_from size: " << column_from->size() << std::endl;
+
+        // Create target decimal type
+        DataTypePtr dt_to = DataTypeFactory::instance().create_data_type(ToPT, 
nullable,
+                                                                         
to_precision, to_scale);
+
+        // Setup cast context
+        bool is_strict_mode = false;
+        auto ctx = create_context(is_strict_mode);
+        auto fn = get_cast_wrapper(ctx.get(), dt_from, dt_to);
+
+        Block block = {
+                {std::move(column_from), dt_from, "from"},
+                {nullptr, dt_to, "to"},
+        };
+
+        // Measure performance
+        int64_t duration_ns = 0;
+        {
+            SCOPED_RAW_TIMER(&duration_ns);
+            EXPECT_TRUE(fn(ctx.get(), block, {0}, 1, block.rows(), nullptr));
+        }
+
+        auto result = block.get_by_position(1).column;
+        EXPECT_EQ(result->size(), orig_line_count * 10);
+
+        // Display results
+        // std::cout << block.dump_data(0, 10) << "\n";
+        std::cout << fmt::format("cast STRING to {} (precision={}, scale={}) 
time: {}\n",
+                                 type_to_string(ToPT), to_precision, to_scale,
+                                 PrettyPrinter::print(duration_ns, 
TUnit::TIME_NS));
+
+        // Calculate throughput
+        double rows_per_second = (orig_line_count * 10.0) / (duration_ns / 
1e9);
+        std::cout << fmt::format("Throughput: {:.2f} million rows/second\n", 
rows_per_second / 1e6);
+        return duration_ns;
+    }
+
+    // Variant with strict mode support
+    template <PrimitiveType ToPT>
+    void perf_test_string_to_decimal_strict(const std::string& test_data_file,
+                                            size_t orig_line_count, int 
to_precision, int to_scale,
+                                            bool nullable, bool strict_mode) {
+        auto dt_from = std::make_shared<DataTypeString>();
+        auto column_orig = dt_from->create_column();
+
+        {
+            MutableColumns columns;
+            columns.push_back(column_orig->get_ptr());
+            DataTypeSerDeSPtrs serde = {dt_from->get_serde()};
+            load_columns_data_from_file(columns, serde, ';', {0}, 
test_data_file);
+            EXPECT_EQ(column_orig->size(), orig_line_count);
+        }
+
+        auto column_from = dt_from->create_column();
+        for (int i = 0; i < 10; ++i) {
+            column_from->insert_range_from(*column_orig, 0, 
column_orig->size());
+        }
+        EXPECT_EQ(column_from->size(), orig_line_count * 10);
+        std::cout << "column_from size: " << column_from->size() << std::endl;
+
+        DataTypePtr dt_to = DataTypeFactory::instance().create_data_type(ToPT, 
nullable,
+                                                                         
to_precision, to_scale);
+
+        auto ctx = create_context(strict_mode);
+        auto fn = get_cast_wrapper(ctx.get(), dt_from, dt_to);
+        ASSERT_TRUE(fn != nullptr);
+
+        Block block = {
+                {std::move(column_from), dt_from, "from"},
+                {nullptr, dt_to, "to"},
+        };
+
+        int64_t duration_ns = 0;
+        {
+            SCOPED_RAW_TIMER(&duration_ns);
+            EXPECT_TRUE(fn(ctx.get(), block, {0}, 1, block.rows(), nullptr));
+        }
+
+        auto result = block.get_by_position(1).column;
+        EXPECT_EQ(result->size(), orig_line_count * 10);
+
+        std::cout << block.dump_data(0, 10) << "\n";
+        std::cout << fmt::format(
+                "cast STRING to {} (precision={}, scale={}, strict_mode={}) 
time: {}\n",
+                type_to_string(ToPT), to_precision, to_scale, strict_mode,
+                PrettyPrinter::print(duration_ns, TUnit::TIME_NS));
+
+        double rows_per_second = (orig_line_count * 10.0) / (duration_ns / 
1e9);
+        std::cout << fmt::format("Throughput: {:.2f} million rows/second\n", 
rows_per_second / 1e6);
+    }
 };
 
 // Optimized version with reserve for better performance
@@ -100,6 +203,123 @@ void read_file_to_vector_optimized(const std::string& 
filename, std::vector<std:
         lines.push_back(std::move(line)); // Move instead of copy
     }
 }
+// Test cases
+/*
+TEST_F(FunctionCastToDecimalPerfTest, test_string_to_decimal128v3_perf) {
+    std::string test_data_file = 
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_38_19.txt";
+    size_t orig_line_count = 10000000; // 10 million lines
+    // Create string column and load data from file
+    auto dt_from = std::make_shared<DataTypeString>();
+    auto column_orig = dt_from->create_column();
+
+    {
+        MutableColumns columns;
+        columns.push_back(column_orig->get_ptr());
+        DataTypeSerDeSPtrs serde = {dt_from->get_serde()};
+        load_columns_data_from_file(columns, serde, ';', {0}, test_data_file);
+        EXPECT_EQ(column_orig->size(), orig_line_count);
+    }
+
+    int test_rounds = 10;
+    int64_t total_duration_ns = 0;
+    for (int i = 0; i < test_rounds; ++i) {
+        std::cout << "\n--- Test Round " << (i + 1) << " ---\n";
+        total_duration_ns += 
perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMAL128I>(
+                column_orig, orig_line_count, 38, 19, false);
+    }
+    auto avg_duration_ns = total_duration_ns / test_rounds;
+    std::cout << fmt::format("\nAverage time over {} rounds: {}\n", 
test_rounds,
+                             PrettyPrinter::print(avg_duration_ns, 
TUnit::TIME_NS));
+}
+
+TEST_F(FunctionCastToDecimalPerfTest, 
test_string_to_decimal128v3_nullable_perf) {
+    std::string test_data_file = 
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_38_19.txt";
+    size_t orig_line_count = 10000000; // 10 million lines
+    // Create string column and load data from file
+    auto dt_from = std::make_shared<DataTypeString>();
+    auto column_orig = dt_from->create_column();
+
+    {
+        MutableColumns columns;
+        columns.push_back(column_orig->get_ptr());
+        DataTypeSerDeSPtrs serde = {dt_from->get_serde()};
+        load_columns_data_from_file(columns, serde, ';', {0}, test_data_file);
+        EXPECT_EQ(column_orig->size(), orig_line_count);
+    }
+
+    int test_rounds = 10;
+    int64_t total_duration_ns = 0;
+    for (int i = 0; i < test_rounds; ++i) {
+        std::cout << "\n--- Test Round " << (i + 1) << " ---\n";
+        total_duration_ns += 
perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMAL128I>(
+                column_orig, orig_line_count, 38, 19, true);
+    }
+    auto avg_duration_ns = total_duration_ns / test_rounds;
+    std::cout << fmt::format("\nAverage time over {} rounds: {}\n", 
test_rounds,
+                             PrettyPrinter::print(avg_duration_ns, 
TUnit::TIME_NS));
+}
+
+TEST_F(FunctionCastToDecimalPerfTest, test_string_to_decimal64v3_perf) {
+    std::string test_data_file = 
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_18_9.txt";
+    size_t orig_line_count = 10000000;
+    perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMAL64>(test_data_file, 
orig_line_count, 18,
+                                                               9, false);
+}
+
+TEST_F(FunctionCastToDecimalPerfTest, test_string_to_decimal32v3_perf) {
+    std::string test_data_file = 
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_9_4.txt";
+    size_t orig_line_count = 10000000;
+    perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMAL32>(test_data_file, 
orig_line_count, 9,
+                                                               4, false);
+}
+
+TEST_F(FunctionCastToDecimalPerfTest, test_string_to_decimalv2_perf) {
+    std::string test_data_file = 
"/mnt/disk2/tengjianping/cast_perf/decimalv2_test_data_27_9.txt";
+    size_t orig_line_count = 10000000;
+    perf_test_string_to_decimal<PrimitiveType::TYPE_DECIMALV2>(test_data_file, 
orig_line_count, 27,
+                                                               9, false);
+}
+
+TEST_F(FunctionCastToDecimalPerfTest, 
test_string_to_decimal128v3_strict_mode_perf) {
+    std::string test_data_file = 
"/mnt/disk2/tengjianping/cast_perf/decimal_test_data_38_19.txt";
+    size_t orig_line_count = 10000000;
+
+    std::cout << "\n=== Testing with strict_mode=false ===\n";
+    perf_test_string_to_decimal_strict<PrimitiveType::TYPE_DECIMAL128I>(
+            test_data_file, orig_line_count, 38, 19, false, false);
+
+    std::cout << "\n=== Testing with strict_mode=true ===\n";
+    perf_test_string_to_decimal_strict<PrimitiveType::TYPE_DECIMAL128I>(
+            test_data_file, orig_line_count, 38, 19, false, true);
+}
+
+// Comparison test: String vs Direct decimal parsing
+TEST_F(FunctionCastToDecimalPerfTest, test_string_parse_comparison) {
+    std::vector<std::string> file_contents;
+    
read_file_to_vector_optimized("/mnt/disk2/tengjianping/cast_perf/decimal_test_data_38_19.txt",
+                                  file_contents);
+
+    size_t line_count = file_contents.size();
+    std::cout << "Read " << line_count << " lines" << std::endl;
+
+    // Test 1: Direct StringParser
+    int64_t duration_ns_parser = 0;
+    StringParser::ParseResult result;
+    {
+        SCOPED_RAW_TIMER(&duration_ns_parser);
+        for (size_t i = 0; i != line_count; ++i) {
+            auto decimal_value = 
StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL128I>(
+                    file_contents[i].c_str(), file_contents[i].size(), 38, 19, 
&result);
+            (void)decimal_value;
+        }
+    }
+    std::cout << "Direct StringParser time: "
+              << PrettyPrinter::print(duration_ns_parser, TUnit::TIME_NS) << 
"\n";
+
+    // Test 2: Cast function (already tested in other tests)
+    std::cout << "\nFor cast function performance, see 
test_string_to_decimal128v3_perf\n";
+}
+    */
 
 /*
 TEST_F(FunctionCastToDecimalPerfTest, test_to_decimal128v3_from_string_perf) {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to