This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 548c79f3362 [Improvement](column) add santy check and add some fix for ColumnString #47964 (#48512) 548c79f3362 is described below commit 548c79f33620024676f0610665e8966e0a69e0f0 Author: Pxl <x...@selectdb.com> AuthorDate: Tue Mar 4 21:47:16 2025 +0800 [Improvement](column) add santy check and add some fix for ColumnString #47964 (#48512) pick part of #47964 --- be/src/vec/columns/column_string.cpp | 33 +++++++++++++++++++++++++++-- be/src/vec/columns/column_string.h | 12 +++++++++++ be/src/vec/functions/date_time_transforms.h | 2 ++ be/src/vec/functions/function_ip.h | 2 ++ be/src/vec/functions/function_uuid.cpp | 1 + 5 files changed, 48 insertions(+), 2 deletions(-) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index c085fdc5493..8cdc44da02a 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -51,6 +51,20 @@ void ColumnStr<T>::sanity_check() const { } } +template <typename T> +void ColumnStr<T>::sanity_check_simple() const { +#ifndef NDEBUG + auto count = offsets.size(); + if (chars.size() != offsets[count - 1]) { + throw Exception(Status::InternalError("row count: {}, chars.size(): {}, offset[{}]: {}", + count, chars.size(), count - 1, offsets[count - 1])); + } + if (offsets[-1] != 0) { + throw Exception(Status::InternalError("wrong offsets[-1]: {}", offsets[-1])); + } +#endif +} + template <typename T> MutableColumnPtr ColumnStr<T>::clone_resized(size_t to_size) const { auto res = ColumnStr<T>::create(); @@ -76,6 +90,8 @@ MutableColumnPtr ColumnStr<T>::clone_resized(size_t to_size) const { res->offsets.resize_fill(to_size, chars.size()); } + res->sanity_check_simple(); + return res; } @@ -134,6 +150,7 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const doris::vectorized::IC src_concrete.offsets[start + i] - nested_offset + prev_max_offset; } } + sanity_check_simple(); } template <typename T> @@ -177,6 +194,7 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, size_t start, size_t le } else { do_insert(assert_cast<const ColumnStr<uint32_t>&>(src)); } + sanity_check_simple(); } template <typename T> @@ -220,6 +238,7 @@ void ColumnStr<T>::insert_indices_from(const IColumn& src, const uint32_t* indic } else { do_insert(assert_cast<const ColumnStr<uint32_t>&>(src)); } + sanity_check_simple(); } template <typename T> @@ -273,7 +292,9 @@ size_t ColumnStr<T>::filter(const IColumn::Filter& filter) { } if constexpr (std::is_same_v<UInt32, T>) { - return filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets, filter); + auto res = filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets, filter); + sanity_check_simple(); + return res; } else { throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "should not call filter in ColumnStr<UInt64>"); @@ -349,7 +370,7 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation& perm, size_t limit) current_new_offset += string_size; res_offsets[i] = current_new_offset; } - + sanity_check_simple(); return res; } @@ -381,6 +402,7 @@ const char* ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) { memcpy(chars.data() + old_size, pos, string_size); offsets.push_back(new_size); + sanity_check_simple(); return pos + string_size; } @@ -446,6 +468,7 @@ void ColumnStr<T>::deserialize_vec_with_null_map(std::vector<StringRef>& keys, insert_default(); } } + sanity_check_simple(); } template <typename T> @@ -558,6 +581,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets& replicate_offsets) con res_chars.resize(res_chars.size() + string_size); memcpy_small_allow_read_write_overflow15(&res_chars[res_chars.size() - string_size], &chars[prev_string_offset], string_size); + check_chars_length(res_chars.size(), res_offsets.size()); } prev_replicate_offset = replicate_offsets[i]; @@ -565,6 +589,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets& replicate_offsets) con } check_chars_length(res_chars.size(), res_offsets.size()); + sanity_check_simple(); return res; } @@ -572,6 +597,7 @@ template <typename T> void ColumnStr<T>::reserve(size_t n) { offsets.reserve(n); chars.reserve(n); + sanity_check_simple(); } template <typename T> @@ -579,9 +605,11 @@ void ColumnStr<T>::resize(size_t n) { auto origin_size = size(); if (origin_size > n) { offsets.resize(n); + chars.resize(offsets[n - 1]); } else if (origin_size < n) { insert_many_defaults(n - origin_size); } + sanity_check_simple(); } template <typename T> @@ -645,6 +673,7 @@ ColumnPtr ColumnStr<T>::convert_column_if_overflow() { } offsets.clear(); + new_col->sanity_check_simple(); return new_col; } return this->get_ptr(); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 5fe8d298818..9106e67b2b5 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -108,6 +108,7 @@ public: bool is_variable_length() const override { return true; } // used in string ut testd void sanity_check() const; + void sanity_check_simple() const; const char* get_family_name() const override { return "String"; } size_t size() const override { return offsets.size(); } @@ -162,6 +163,7 @@ public: chars.resize(new_size); memcpy(chars.data() + old_size, s.data, size_to_append); offsets.push_back(new_size); + sanity_check_simple(); } bool is_column_string64() const override { return sizeof(T) == sizeof(uint64_t); } @@ -186,6 +188,7 @@ public: size_to_append); offsets.push_back(new_size); } + sanity_check_simple(); } void insert_data(const char* pos, size_t length) override { @@ -198,6 +201,7 @@ public: memcpy(chars.data() + old_size, pos, length); } offsets.push_back(new_size); + sanity_check_simple(); } void insert_data_without_reserve(const char* pos, size_t length) { @@ -210,6 +214,7 @@ public: memcpy(chars.data() + old_size, pos, length); } offsets.push_back_without_reserve(new_size); + sanity_check_simple(); } /// Before insert strings, the caller should calculate the total size of strings, @@ -243,6 +248,7 @@ public: } check_chars_length(offset, offsets.size()); chars.resize(offset); + sanity_check_simple(); } void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_, @@ -268,6 +274,7 @@ public: offsets_ptr[i] = tail_offset + offsets_[i + 1] - begin_offset; } DCHECK(chars.size() == offsets.back()); + sanity_check_simple(); } void insert_many_binary_data(char* data_array, uint32_t* len_array, @@ -291,6 +298,7 @@ public: offset += len; offsets.push_back(offset); } + sanity_check_simple(); } void insert_many_strings(const StringRef* strings, size_t num) override { @@ -313,6 +321,7 @@ public: } offsets.push_back(offset); } + sanity_check_simple(); } // template <typename T, size_t copy_length> @@ -341,6 +350,7 @@ public: offsets.push_back(offset); } chars.resize(old_size + new_size); + sanity_check_simple(); } void insert_many_strings_overflow(const StringRef* strings, size_t num, @@ -382,12 +392,14 @@ public: memcpy(chars.data() + old_size, src.data, src.size); old_size += src.size; } + sanity_check_simple(); } void pop_back(size_t n) override { size_t nested_n = offsets.back() - offset_at(offsets.size() - n); chars.resize(chars.size() - nested_n); offsets.resize_assume_reserved(offsets.size() - n); + sanity_check_simple(); } StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override; diff --git a/be/src/vec/functions/date_time_transforms.h b/be/src/vec/functions/date_time_transforms.h index eb1791e9177..96b8738a38b 100644 --- a/be/src/vec/functions/date_time_transforms.h +++ b/be/src/vec/functions/date_time_transforms.h @@ -284,6 +284,7 @@ struct TransformerToStringOneArgument { res_offsets[i] = Transform::execute(date_time_value, res_data, offset); null_map[i] = !date_time_value.is_valid_date(); } + res_data.resize(res_offsets[res_offsets.size() - 1]); } static void vector(FunctionContext* context, @@ -302,6 +303,7 @@ struct TransformerToStringOneArgument { res_offsets[i] = Transform::execute(date_time_value, res_data, offset); DCHECK(date_time_value.is_valid_date()); } + res_data.resize(res_offsets[res_offsets.size() - 1]); } }; diff --git a/be/src/vec/functions/function_ip.h b/be/src/vec/functions/function_ip.h index 99b37c94a52..b75f40bf09d 100644 --- a/be/src/vec/functions/function_ip.h +++ b/be/src/vec/functions/function_ip.h @@ -339,6 +339,7 @@ public: process_ipv6_column<ColumnString>(column, input_rows_count, vec_res, offsets_res, null_map, ipv6_address_data); } + vec_res.resize(offsets_res[offsets_res.size() - 1]); block.replace_by_position(result, ColumnNullable::create(std::move(col_res), std::move(null_map))); @@ -1261,6 +1262,7 @@ public: cut_address(address, pos, bytes_to_cut_count); offsets_res[i] = pos - begin; } + chars_res.resize(offsets_res[offsets_res.size() - 1]); block.replace_by_position(result, std::move(col_res)); return Status::OK(); diff --git a/be/src/vec/functions/function_uuid.cpp b/be/src/vec/functions/function_uuid.cpp index cee5fd7a363..bc4dec00705 100644 --- a/be/src/vec/functions/function_uuid.cpp +++ b/be/src/vec/functions/function_uuid.cpp @@ -180,6 +180,7 @@ public: col_offset[row] = col_offset[row - 1] + str_length; deserialize((char*)arg, col_data.data() + str_length * row); } + col_data.resize(str_length * input_rows_count); block.replace_by_position(result, std::move(result_column)); return Status::OK(); } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org