This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 548c79f3362 [Improvement](column) add sanity check and add some fix for 
ColumnString #47964  (#48512)
548c79f3362 is described below

commit 548c79f33620024676f0610665e8966e0a69e0f0
Author: Pxl <x...@selectdb.com>
AuthorDate: Tue Mar 4 21:47:16 2025 +0800

    [Improvement](column) add sanity check and add some fix for ColumnString 
#47964  (#48512)
    
    pick part of #47964
---
 be/src/vec/columns/column_string.cpp        | 33 +++++++++++++++++++++++++++--
 be/src/vec/columns/column_string.h          | 12 +++++++++++
 be/src/vec/functions/date_time_transforms.h |  2 ++
 be/src/vec/functions/function_ip.h          |  2 ++
 be/src/vec/functions/function_uuid.cpp      |  1 +
 5 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/be/src/vec/columns/column_string.cpp 
b/be/src/vec/columns/column_string.cpp
index c085fdc5493..8cdc44da02a 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -51,6 +51,20 @@ void ColumnStr<T>::sanity_check() const {
     }
 }
 
+template <typename T>
+void ColumnStr<T>::sanity_check_simple() const {
+#ifndef NDEBUG
+    auto count = offsets.size();
+    if (chars.size() != offsets[count - 1]) {
+        throw Exception(Status::InternalError("row count: {}, chars.size(): 
{}, offset[{}]: {}",
+                                              count, chars.size(), count - 1, 
offsets[count - 1]));
+    }
+    if (offsets[-1] != 0) {
+        throw Exception(Status::InternalError("wrong offsets[-1]: {}", 
offsets[-1]));
+    }
+#endif
+}
+
 template <typename T>
 MutableColumnPtr ColumnStr<T>::clone_resized(size_t to_size) const {
     auto res = ColumnStr<T>::create();
@@ -76,6 +90,8 @@ MutableColumnPtr ColumnStr<T>::clone_resized(size_t to_size) 
const {
         res->offsets.resize_fill(to_size, chars.size());
     }
 
+    res->sanity_check_simple();
+
     return res;
 }
 
@@ -134,6 +150,7 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const 
doris::vectorized::IC
                     src_concrete.offsets[start + i] - nested_offset + 
prev_max_offset;
         }
     }
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -177,6 +194,7 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, 
size_t start, size_t le
     } else {
         do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
     }
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -220,6 +238,7 @@ void ColumnStr<T>::insert_indices_from(const IColumn& src, 
const uint32_t* indic
     } else {
         do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
     }
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -273,7 +292,9 @@ size_t ColumnStr<T>::filter(const IColumn::Filter& filter) {
     }
 
     if constexpr (std::is_same_v<UInt32, T>) {
-        return filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets, 
filter);
+        auto res = filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets, 
filter);
+        sanity_check_simple();
+        return res;
     } else {
         throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
                                "should not call filter in ColumnStr<UInt64>");
@@ -349,7 +370,7 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation& 
perm, size_t limit)
         current_new_offset += string_size;
         res_offsets[i] = current_new_offset;
     }
-
+    sanity_check_simple();
     return res;
 }
 
@@ -381,6 +402,7 @@ const char* 
ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) {
     memcpy(chars.data() + old_size, pos, string_size);
 
     offsets.push_back(new_size);
+    sanity_check_simple();
     return pos + string_size;
 }
 
@@ -446,6 +468,7 @@ void 
ColumnStr<T>::deserialize_vec_with_null_map(std::vector<StringRef>& keys,
             insert_default();
         }
     }
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -558,6 +581,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets& 
replicate_offsets) con
             res_chars.resize(res_chars.size() + string_size);
             
memcpy_small_allow_read_write_overflow15(&res_chars[res_chars.size() - 
string_size],
                                                      
&chars[prev_string_offset], string_size);
+            check_chars_length(res_chars.size(), res_offsets.size());
         }
 
         prev_replicate_offset = replicate_offsets[i];
@@ -565,6 +589,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets& 
replicate_offsets) con
     }
 
     check_chars_length(res_chars.size(), res_offsets.size());
+    sanity_check_simple();
     return res;
 }
 
@@ -572,6 +597,7 @@ template <typename T>
 void ColumnStr<T>::reserve(size_t n) {
     offsets.reserve(n);
     chars.reserve(n);
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -579,9 +605,11 @@ void ColumnStr<T>::resize(size_t n) {
     auto origin_size = size();
     if (origin_size > n) {
         offsets.resize(n);
+        chars.resize(offsets[n - 1]);
     } else if (origin_size < n) {
         insert_many_defaults(n - origin_size);
     }
+    sanity_check_simple();
 }
 
 template <typename T>
@@ -645,6 +673,7 @@ ColumnPtr ColumnStr<T>::convert_column_if_overflow() {
         }
 
         offsets.clear();
+        new_col->sanity_check_simple();
         return new_col;
     }
     return this->get_ptr();
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index 5fe8d298818..9106e67b2b5 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -108,6 +108,7 @@ public:
     bool is_variable_length() const override { return true; }
     // used in string ut testd
     void sanity_check() const;
+    void sanity_check_simple() const;
     const char* get_family_name() const override { return "String"; }
 
     size_t size() const override { return offsets.size(); }
@@ -162,6 +163,7 @@ public:
         chars.resize(new_size);
         memcpy(chars.data() + old_size, s.data, size_to_append);
         offsets.push_back(new_size);
+        sanity_check_simple();
     }
 
     bool is_column_string64() const override { return sizeof(T) == 
sizeof(uint64_t); }
@@ -186,6 +188,7 @@ public:
                                                      size_to_append);
             offsets.push_back(new_size);
         }
+        sanity_check_simple();
     }
 
     void insert_data(const char* pos, size_t length) override {
@@ -198,6 +201,7 @@ public:
             memcpy(chars.data() + old_size, pos, length);
         }
         offsets.push_back(new_size);
+        sanity_check_simple();
     }
 
     void insert_data_without_reserve(const char* pos, size_t length) {
@@ -210,6 +214,7 @@ public:
             memcpy(chars.data() + old_size, pos, length);
         }
         offsets.push_back_without_reserve(new_size);
+        sanity_check_simple();
     }
 
     /// Before insert strings, the caller should calculate the total size of 
strings,
@@ -243,6 +248,7 @@ public:
         }
         check_chars_length(offset, offsets.size());
         chars.resize(offset);
+        sanity_check_simple();
     }
 
     void insert_many_continuous_binary_data(const char* data, const uint32_t* 
offsets_,
@@ -268,6 +274,7 @@ public:
             offsets_ptr[i] = tail_offset + offsets_[i + 1] - begin_offset;
         }
         DCHECK(chars.size() == offsets.back());
+        sanity_check_simple();
     }
 
     void insert_many_binary_data(char* data_array, uint32_t* len_array,
@@ -291,6 +298,7 @@ public:
             offset += len;
             offsets.push_back(offset);
         }
+        sanity_check_simple();
     }
 
     void insert_many_strings(const StringRef* strings, size_t num) override {
@@ -313,6 +321,7 @@ public:
             }
             offsets.push_back(offset);
         }
+        sanity_check_simple();
     }
 
     //    template <typename T, size_t copy_length>
@@ -341,6 +350,7 @@ public:
             offsets.push_back(offset);
         }
         chars.resize(old_size + new_size);
+        sanity_check_simple();
     }
 
     void insert_many_strings_overflow(const StringRef* strings, size_t num,
@@ -382,12 +392,14 @@ public:
             memcpy(chars.data() + old_size, src.data, src.size);
             old_size += src.size;
         }
+        sanity_check_simple();
     }
 
     void pop_back(size_t n) override {
         size_t nested_n = offsets.back() - offset_at(offsets.size() - n);
         chars.resize(chars.size() - nested_n);
         offsets.resize_assume_reserved(offsets.size() - n);
+        sanity_check_simple();
     }
 
     StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& 
begin) const override;
diff --git a/be/src/vec/functions/date_time_transforms.h 
b/be/src/vec/functions/date_time_transforms.h
index eb1791e9177..96b8738a38b 100644
--- a/be/src/vec/functions/date_time_transforms.h
+++ b/be/src/vec/functions/date_time_transforms.h
@@ -284,6 +284,7 @@ struct TransformerToStringOneArgument {
             res_offsets[i] = Transform::execute(date_time_value, res_data, 
offset);
             null_map[i] = !date_time_value.is_valid_date();
         }
+        res_data.resize(res_offsets[res_offsets.size() - 1]);
     }
 
     static void vector(FunctionContext* context,
@@ -302,6 +303,7 @@ struct TransformerToStringOneArgument {
             res_offsets[i] = Transform::execute(date_time_value, res_data, 
offset);
             DCHECK(date_time_value.is_valid_date());
         }
+        res_data.resize(res_offsets[res_offsets.size() - 1]);
     }
 };
 
diff --git a/be/src/vec/functions/function_ip.h 
b/be/src/vec/functions/function_ip.h
index 99b37c94a52..b75f40bf09d 100644
--- a/be/src/vec/functions/function_ip.h
+++ b/be/src/vec/functions/function_ip.h
@@ -339,6 +339,7 @@ public:
             process_ipv6_column<ColumnString>(column, input_rows_count, 
vec_res, offsets_res,
                                               null_map, ipv6_address_data);
         }
+        vec_res.resize(offsets_res[offsets_res.size() - 1]);
 
         block.replace_by_position(result,
                                   ColumnNullable::create(std::move(col_res), 
std::move(null_map)));
@@ -1261,6 +1262,7 @@ public:
             cut_address(address, pos, bytes_to_cut_count);
             offsets_res[i] = pos - begin;
         }
+        chars_res.resize(offsets_res[offsets_res.size() - 1]);
 
         block.replace_by_position(result, std::move(col_res));
         return Status::OK();
diff --git a/be/src/vec/functions/function_uuid.cpp 
b/be/src/vec/functions/function_uuid.cpp
index cee5fd7a363..bc4dec00705 100644
--- a/be/src/vec/functions/function_uuid.cpp
+++ b/be/src/vec/functions/function_uuid.cpp
@@ -180,6 +180,7 @@ public:
             col_offset[row] = col_offset[row - 1] + str_length;
             deserialize((char*)arg, col_data.data() + str_length * row);
         }
+        col_data.resize(str_length * input_rows_count);
         block.replace_by_position(result, std::move(result_column));
         return Status::OK();
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to