This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 2b3f8eef6ee  [Improvement](column) add santy check and add some fix 
for ColumnString #47964 (#48513)
2b3f8eef6ee is described below

commit 2b3f8eef6ee3050664d739cbb0071be177f9f0b8
Author: Pxl <x...@selectdb.com>
AuthorDate: Thu Mar 20 14:32:01 2025 +0800

     [Improvement](column) add santy check and add some fix for ColumnString 
#47964 (#48513)
    
    pick part of https://github.com/apache/doris/pull/47964
---
 be/src/vec/columns/column_string.cpp             | 56 +++++++++++++++++++-----
 be/src/vec/columns/column_string.h               | 12 +++++
 be/src/vec/functions/date_time_transforms.h      |  2 +
 be/src/vec/functions/function_decode_varchar.cpp |  2 +-
 be/src/vec/functions/function_ip.h               |  2 +
 be/src/vec/functions/function_uuid.cpp           |  1 +
 6 files changed, 62 insertions(+), 13 deletions(-)

diff --git a/be/src/vec/columns/column_string.cpp 
b/be/src/vec/columns/column_string.cpp
index 8bcc6565467..b61e28ce3f8 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -36,20 +36,30 @@ namespace doris::vectorized {
 
 template <typename T>
 void ColumnStr<T>::sanity_check() const {
-    auto count = offsets.size();
+#ifndef NDEBUG
+    sanity_check_simple();
+    auto count = (int)offsets.size();
+    for (int i = 0; i < count; ++i) {
+        if (offsets[i] < offsets[i - 1]) {
+            throw Exception(Status::InternalError("row count: {}, offsets[{}]: 
{}, offsets[{}]: {}",
+                                                  count, i, offsets[i], i - 1, 
offsets[i - 1]));
+        }
+    }
+#endif
+}
+
+template <typename T>
+void ColumnStr<T>::sanity_check_simple() const {
+#ifndef NDEBUG
+    auto count = (int)offsets.size();
     if (chars.size() != offsets[count - 1]) {
-        LOG(FATAL) << "row count: " << count << ", chars.size(): " << 
chars.size() << ", offset["
-                   << count - 1 << "]: " << offsets[count - 1];
+        throw Exception(Status::InternalError("row count: {}, chars.size(): 
{}, offset[{}]: {}",
+                                              count, chars.size(), count - 1, 
offsets[count - 1]));
     }
     if (offsets[-1] != 0) {
-        LOG(FATAL) << "wrong offsets[-1]: " << offsets[-1];
-    }
-    for (size_t i = 0; i < count; ++i) {
-        if (offsets[i] < offsets[i - 1]) {
-            LOG(FATAL) << "row count: " << count << ", offsets[" << i << "]: " 
<< offsets[i]
-                       << ", offsets[" << i - 1 << "]: " << offsets[i - 1];
-        }
+        throw Exception(Status::InternalError("wrong offsets[-1]: {}", 
offsets[-1]));
     }
+#endif
 }
 
 template <typename T>
@@ -77,6 +87,8 @@ MutableColumnPtr ColumnStr<T>::clone_resized(size_t to_size) 
const {
         res->offsets.resize_fill(to_size, chars.size());
     }
 
+    res->sanity_check_simple();
+
     return res;
 }
 
@@ -135,6 +147,14 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const 
doris::vectorized::IC
                     src_concrete.offsets[start + i] - nested_offset + 
prev_max_offset;
         }
     }
+#ifndef NDEBUG
+    auto count = int64_t(offsets.size());
+    // offsets may overflow, so we make chars.size() as T to do same overflow 
check
+    if (offsets.back() != T(chars.size())) {
+        throw Exception(Status::InternalError("row count: {}, chars.size(): 
{}, offset[{}]: {}",
+                                              count, chars.size(), count - 1, 
offsets[count - 1]));
+    }
+#endif
 }
 
 template <typename T>
@@ -178,6 +198,7 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, 
size_t start, size_t le
     } else {
         do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
     }
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -244,6 +265,7 @@ void ColumnStr<T>::insert_indices_from(const IColumn& src, 
const uint32_t* indic
     } else {
         do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
     }
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -297,7 +319,9 @@ size_t ColumnStr<T>::filter(const IColumn::Filter& filter) {
     }
 
     if constexpr (std::is_same_v<UInt32, T>) {
-        return filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets, 
filter);
+        auto res = filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets, 
filter);
+        sanity_check_simple();
+        return res;
     } else {
         throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
                                "should not call filter in ColumnStr<UInt64>");
@@ -373,7 +397,7 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation& 
perm, size_t limit)
         current_new_offset += string_size;
         res_offsets[i] = current_new_offset;
     }
-
+    sanity_check_simple();
     return res;
 }
 
@@ -405,6 +429,7 @@ const char* 
ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) {
     memcpy(chars.data() + old_size, pos, string_size);
 
     offsets.push_back(new_size);
+    sanity_check_simple();
     return pos + string_size;
 }
 
@@ -495,6 +520,7 @@ void 
ColumnStr<T>::deserialize_vec_with_null_map(std::vector<StringRef>& keys,
             insert_default();
         }
     }
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -558,6 +584,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets& 
replicate_offsets) con
             res_chars.resize(res_chars.size() + string_size);
             
memcpy_small_allow_read_write_overflow15(&res_chars[res_chars.size() - 
string_size],
                                                      
&chars[prev_string_offset], string_size);
+            check_chars_length(res_chars.size(), res_offsets.size());
         }
 
         prev_replicate_offset = replicate_offsets[i];
@@ -565,6 +592,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets& 
replicate_offsets) con
     }
 
     check_chars_length(res_chars.size(), res_offsets.size());
+    sanity_check_simple();
     return res;
 }
 
@@ -572,6 +600,7 @@ template <typename T>
 void ColumnStr<T>::reserve(size_t n) {
     offsets.reserve(n);
     chars.reserve(n);
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -579,9 +608,11 @@ void ColumnStr<T>::resize(size_t n) {
     auto origin_size = size();
     if (origin_size > n) {
         offsets.resize(n);
+        chars.resize(offsets[n - 1]);
     } else if (origin_size < n) {
         insert_many_defaults(n - origin_size);
     }
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -637,6 +668,7 @@ ColumnPtr ColumnStr<T>::convert_column_if_overflow() {
         }
 
         offsets.clear();
+        new_col->sanity_check_simple();
         return new_col;
     }
     return this->get_ptr();
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index e696b6f0764..33afabfc576 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -108,6 +108,7 @@ public:
     bool is_variable_length() const override { return true; }
     // used in string ut testd
     void sanity_check() const;
+    void sanity_check_simple() const;
     const char* get_family_name() const override { return "String"; }
 
     size_t size() const override { return offsets.size(); }
@@ -162,6 +163,7 @@ public:
         chars.resize(new_size);
         memcpy(chars.data() + old_size, s.data, size_to_append);
         offsets.push_back(new_size);
+        sanity_check_simple();
     }
 
     void insert_many_from(const IColumn& src, size_t position, size_t length) 
override;
@@ -188,6 +190,7 @@ public:
                                                      size_to_append);
             offsets.push_back(new_size);
         }
+        sanity_check_simple();
     }
 
     void insert_data(const char* pos, size_t length) override {
@@ -200,6 +203,7 @@ public:
             memcpy(chars.data() + old_size, pos, length);
         }
         offsets.push_back(new_size);
+        sanity_check_simple();
     }
 
     void insert_data_without_reserve(const char* pos, size_t length) {
@@ -212,6 +216,7 @@ public:
             memcpy(chars.data() + old_size, pos, length);
         }
         offsets.push_back_without_reserve(new_size);
+        sanity_check_simple();
     }
 
     /// Before insert strings, the caller should calculate the total size of 
strings,
@@ -245,6 +250,7 @@ public:
         }
         check_chars_length(offset, offsets.size());
         chars.resize(offset);
+        sanity_check_simple();
     }
 
     void insert_many_continuous_binary_data(const char* data, const uint32_t* 
offsets_,
@@ -270,6 +276,7 @@ public:
             offsets_ptr[i] = tail_offset + offsets_[i + 1] - begin_offset;
         }
         DCHECK(chars.size() == offsets.back());
+        sanity_check_simple();
     }
 
     void insert_many_binary_data(char* data_array, uint32_t* len_array,
@@ -293,6 +300,7 @@ public:
             offset += len;
             offsets.push_back(offset);
         }
+        sanity_check_simple();
     }
 
     void insert_many_strings(const StringRef* strings, size_t num) override {
@@ -315,6 +323,7 @@ public:
             }
             offsets.push_back(offset);
         }
+        sanity_check_simple();
     }
 
     template <size_t copy_length>
@@ -339,6 +348,7 @@ public:
             offsets.push_back(offset);
         }
         chars.resize(old_size + new_size);
+        sanity_check_simple();
     }
 
     void insert_many_strings_overflow(const StringRef* strings, size_t num,
@@ -380,12 +390,14 @@ public:
             memcpy(chars.data() + old_size, src.data, src.size);
             old_size += src.size;
         }
+        sanity_check_simple();
     }
 
     void pop_back(size_t n) override {
         size_t nested_n = offsets.back() - offset_at(offsets.size() - n);
         chars.resize(chars.size() - nested_n);
         offsets.resize_assume_reserved(offsets.size() - n);
+        sanity_check_simple();
     }
 
     StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& 
begin) const override;
diff --git a/be/src/vec/functions/date_time_transforms.h 
b/be/src/vec/functions/date_time_transforms.h
index 73155afae3a..33accd9f706 100644
--- a/be/src/vec/functions/date_time_transforms.h
+++ b/be/src/vec/functions/date_time_transforms.h
@@ -316,6 +316,7 @@ struct TransformerToStringOneArgument {
             res_offsets[i] = Transform::execute(date_time_value, res_data, 
offset);
             null_map[i] = !date_time_value.is_valid_date();
         }
+        res_data.resize(res_offsets[res_offsets.size() - 1]);
     }
 
     static void vector(FunctionContext* context,
@@ -334,6 +335,7 @@ struct TransformerToStringOneArgument {
             res_offsets[i] = Transform::execute(date_time_value, res_data, 
offset);
             DCHECK(date_time_value.is_valid_date());
         }
+        res_data.resize(res_offsets[res_offsets.size() - 1]);
     }
 };
 
diff --git a/be/src/vec/functions/function_decode_varchar.cpp 
b/be/src/vec/functions/function_decode_varchar.cpp
index 59f7ecfac04..c4aabc92eb1 100644
--- a/be/src/vec/functions/function_decode_varchar.cpp
+++ b/be/src/vec/functions/function_decode_varchar.cpp
@@ -106,7 +106,7 @@ public:
             simd::reverse_copy_bytes(col_res_data.data() + col_res_offset[i - 
1], str_size,
                                      ui8_ptr + sizeof(IntegerType) - str_size, 
str_size);
         }
-
+        col_res_data.resize(col_res_offset[col_res_offset.size() - 1]);
         block.get_by_position(result).column = std::move(col_res);
 
         return Status::OK();
diff --git a/be/src/vec/functions/function_ip.h 
b/be/src/vec/functions/function_ip.h
index 00b43955372..9c4737c84a7 100644
--- a/be/src/vec/functions/function_ip.h
+++ b/be/src/vec/functions/function_ip.h
@@ -335,6 +335,7 @@ public:
             process_ipv6_column<ColumnString>(column, input_rows_count, 
vec_res, offsets_res,
                                               null_map, ipv6_address_data);
         }
+        vec_res.resize(offsets_res[offsets_res.size() - 1]);
 
         block.replace_by_position(result,
                                   ColumnNullable::create(std::move(col_res), 
std::move(null_map)));
@@ -1388,6 +1389,7 @@ public:
             cut_address(address, pos, bytes_to_cut_count);
             offsets_res[i] = pos - begin;
         }
+        chars_res.resize(offsets_res[offsets_res.size() - 1]);
 
         block.replace_by_position(result, std::move(col_res));
         return Status::OK();
diff --git a/be/src/vec/functions/function_uuid.cpp 
b/be/src/vec/functions/function_uuid.cpp
index cee5fd7a363..bc4dec00705 100644
--- a/be/src/vec/functions/function_uuid.cpp
+++ b/be/src/vec/functions/function_uuid.cpp
@@ -180,6 +180,7 @@ public:
             col_offset[row] = col_offset[row - 1] + str_length;
             deserialize((char*)arg, col_data.data() + str_length * row);
         }
+        col_data.resize(str_length * input_rows_count);
         block.replace_by_position(result, std::move(result_column));
         return Status::OK();
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to