This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 2b3f8eef6ee [Improvement](column) add santy check and add some fix for ColumnString #47964 (#48513) 2b3f8eef6ee is described below commit 2b3f8eef6ee3050664d739cbb0071be177f9f0b8 Author: Pxl <x...@selectdb.com> AuthorDate: Thu Mar 20 14:32:01 2025 +0800 [Improvement](column) add santy check and add some fix for ColumnString #47964 (#48513) pick part of https://github.com/apache/doris/pull/47964 --- be/src/vec/columns/column_string.cpp | 56 +++++++++++++++++++----- be/src/vec/columns/column_string.h | 12 +++++ be/src/vec/functions/date_time_transforms.h | 2 + be/src/vec/functions/function_decode_varchar.cpp | 2 +- be/src/vec/functions/function_ip.h | 2 + be/src/vec/functions/function_uuid.cpp | 1 + 6 files changed, 62 insertions(+), 13 deletions(-) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 8bcc6565467..b61e28ce3f8 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -36,20 +36,30 @@ namespace doris::vectorized { template <typename T> void ColumnStr<T>::sanity_check() const { - auto count = offsets.size(); +#ifndef NDEBUG + sanity_check_simple(); + auto count = (int)offsets.size(); + for (int i = 0; i < count; ++i) { + if (offsets[i] < offsets[i - 1]) { + throw Exception(Status::InternalError("row count: {}, offsets[{}]: {}, offsets[{}]: {}", + count, i, offsets[i], i - 1, offsets[i - 1])); + } + } +#endif +} + +template <typename T> +void ColumnStr<T>::sanity_check_simple() const { +#ifndef NDEBUG + auto count = (int)offsets.size(); if (chars.size() != offsets[count - 1]) { - LOG(FATAL) << "row count: " << count << ", chars.size(): " << chars.size() << ", offset[" - << count - 1 << "]: " << offsets[count - 1]; + throw Exception(Status::InternalError("row count: {}, chars.size(): {}, offset[{}]: {}", + count, chars.size(), count - 1, offsets[count - 1])); } if (offsets[-1] != 0) { - LOG(FATAL) << "wrong offsets[-1]: " << offsets[-1]; - } - for (size_t i = 0; i < count; ++i) { - if (offsets[i] < offsets[i - 1]) { - LOG(FATAL) << "row count: " << count << ", offsets[" << i << "]: " << offsets[i] - << ", offsets[" << i - 1 << "]: " << offsets[i - 1]; - } + throw Exception(Status::InternalError("wrong offsets[-1]: {}", offsets[-1])); } +#endif } template <typename T> @@ -77,6 +87,8 @@ MutableColumnPtr ColumnStr<T>::clone_resized(size_t to_size) const { res->offsets.resize_fill(to_size, chars.size()); } + res->sanity_check_simple(); + return res; } @@ -135,6 +147,14 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const doris::vectorized::IC src_concrete.offsets[start + i] - nested_offset + prev_max_offset; } } +#ifndef NDEBUG + auto count = int64_t(offsets.size()); + // offsets may overflow, so we make chars.size() as T to do same overflow check + if (offsets.back() != T(chars.size())) { + throw Exception(Status::InternalError("row count: {}, chars.size(): {}, offset[{}]: {}", + count, chars.size(), count - 1, offsets[count - 1])); + } +#endif } template <typename T> @@ -178,6 +198,7 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, size_t start, size_t le } else { do_insert(assert_cast<const ColumnStr<uint32_t>&>(src)); } + sanity_check_simple(); } template <typename T> @@ -244,6 +265,7 @@ void ColumnStr<T>::insert_indices_from(const IColumn& src, const uint32_t* indic } else { do_insert(assert_cast<const ColumnStr<uint32_t>&>(src)); } + sanity_check_simple(); } template <typename T> @@ -297,7 +319,9 @@ size_t ColumnStr<T>::filter(const IColumn::Filter& filter) { } if constexpr (std::is_same_v<UInt32, T>) { - return filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets, filter); + auto res = filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets, filter); + sanity_check_simple(); + return res; } else { throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "should not call filter in ColumnStr<UInt64>"); @@ -373,7 +397,7 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation& perm, size_t limit) current_new_offset += string_size; res_offsets[i] = current_new_offset; } - + sanity_check_simple(); return res; } @@ -405,6 +429,7 @@ const char* ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) { memcpy(chars.data() + old_size, pos, string_size); offsets.push_back(new_size); + sanity_check_simple(); return pos + string_size; } @@ -495,6 +520,7 @@ void ColumnStr<T>::deserialize_vec_with_null_map(std::vector<StringRef>& keys, insert_default(); } } + sanity_check_simple(); } template <typename T> @@ -558,6 +584,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets& replicate_offsets) con res_chars.resize(res_chars.size() + string_size); memcpy_small_allow_read_write_overflow15(&res_chars[res_chars.size() - string_size], &chars[prev_string_offset], string_size); + check_chars_length(res_chars.size(), res_offsets.size()); } prev_replicate_offset = replicate_offsets[i]; @@ -565,6 +592,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets& replicate_offsets) con } check_chars_length(res_chars.size(), res_offsets.size()); + sanity_check_simple(); return res; } @@ -572,6 +600,7 @@ template <typename T> void ColumnStr<T>::reserve(size_t n) { offsets.reserve(n); chars.reserve(n); + sanity_check_simple(); } template <typename T> @@ -579,9 +608,11 @@ void ColumnStr<T>::resize(size_t n) { auto origin_size = size(); if (origin_size > n) { offsets.resize(n); + chars.resize(offsets[n - 1]); } else if (origin_size < n) { insert_many_defaults(n - origin_size); } + sanity_check_simple(); } template <typename T> @@ -637,6 +668,7 @@ ColumnPtr ColumnStr<T>::convert_column_if_overflow() { } offsets.clear(); + new_col->sanity_check_simple(); return new_col; } return this->get_ptr(); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index e696b6f0764..33afabfc576 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -108,6 +108,7 @@ public: bool is_variable_length() const override { return true; } // used in string ut testd void sanity_check() const; + void sanity_check_simple() const; const char* get_family_name() const override { return "String"; } size_t size() const override { return offsets.size(); } @@ -162,6 +163,7 @@ public: chars.resize(new_size); memcpy(chars.data() + old_size, s.data, size_to_append); offsets.push_back(new_size); + sanity_check_simple(); } void insert_many_from(const IColumn& src, size_t position, size_t length) override; @@ -188,6 +190,7 @@ public: size_to_append); offsets.push_back(new_size); } + sanity_check_simple(); } void insert_data(const char* pos, size_t length) override { @@ -200,6 +203,7 @@ public: memcpy(chars.data() + old_size, pos, length); } offsets.push_back(new_size); + sanity_check_simple(); } void insert_data_without_reserve(const char* pos, size_t length) { @@ -212,6 +216,7 @@ public: memcpy(chars.data() + old_size, pos, length); } offsets.push_back_without_reserve(new_size); + sanity_check_simple(); } /// Before insert strings, the caller should calculate the total size of strings, @@ -245,6 +250,7 @@ public: } check_chars_length(offset, offsets.size()); chars.resize(offset); + sanity_check_simple(); } void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_, @@ -270,6 +276,7 @@ public: offsets_ptr[i] = tail_offset + offsets_[i + 1] - begin_offset; } DCHECK(chars.size() == offsets.back()); + sanity_check_simple(); } void insert_many_binary_data(char* data_array, uint32_t* len_array, @@ -293,6 +300,7 @@ public: offset += len; offsets.push_back(offset); } + sanity_check_simple(); } void insert_many_strings(const StringRef* strings, size_t num) override { @@ -315,6 +323,7 @@ public: } offsets.push_back(offset); } + sanity_check_simple(); } template <size_t copy_length> @@ -339,6 +348,7 @@ public: offsets.push_back(offset); } chars.resize(old_size + new_size); + sanity_check_simple(); } void insert_many_strings_overflow(const StringRef* strings, size_t num, @@ -380,12 +390,14 @@ public: memcpy(chars.data() + old_size, src.data, src.size); old_size += src.size; } + sanity_check_simple(); } void pop_back(size_t n) override { size_t nested_n = offsets.back() - offset_at(offsets.size() - n); chars.resize(chars.size() - nested_n); offsets.resize_assume_reserved(offsets.size() - n); + sanity_check_simple(); } StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override; diff --git a/be/src/vec/functions/date_time_transforms.h b/be/src/vec/functions/date_time_transforms.h index 73155afae3a..33accd9f706 100644 --- a/be/src/vec/functions/date_time_transforms.h +++ b/be/src/vec/functions/date_time_transforms.h @@ -316,6 +316,7 @@ struct TransformerToStringOneArgument { res_offsets[i] = Transform::execute(date_time_value, res_data, offset); null_map[i] = !date_time_value.is_valid_date(); } + res_data.resize(res_offsets[res_offsets.size() - 1]); } static void vector(FunctionContext* context, @@ -334,6 +335,7 @@ struct TransformerToStringOneArgument { res_offsets[i] = Transform::execute(date_time_value, res_data, offset); DCHECK(date_time_value.is_valid_date()); } + res_data.resize(res_offsets[res_offsets.size() - 1]); } }; diff --git a/be/src/vec/functions/function_decode_varchar.cpp b/be/src/vec/functions/function_decode_varchar.cpp index 59f7ecfac04..c4aabc92eb1 100644 --- a/be/src/vec/functions/function_decode_varchar.cpp +++ b/be/src/vec/functions/function_decode_varchar.cpp @@ -106,7 +106,7 @@ public: simd::reverse_copy_bytes(col_res_data.data() + col_res_offset[i - 1], str_size, ui8_ptr + sizeof(IntegerType) - str_size, str_size); } - + col_res_data.resize(col_res_offset[col_res_offset.size() - 1]); block.get_by_position(result).column = std::move(col_res); return Status::OK(); diff --git a/be/src/vec/functions/function_ip.h b/be/src/vec/functions/function_ip.h index 00b43955372..9c4737c84a7 100644 --- a/be/src/vec/functions/function_ip.h +++ b/be/src/vec/functions/function_ip.h @@ -335,6 +335,7 @@ public: process_ipv6_column<ColumnString>(column, input_rows_count, vec_res, offsets_res, null_map, ipv6_address_data); } + vec_res.resize(offsets_res[offsets_res.size() - 1]); block.replace_by_position(result, ColumnNullable::create(std::move(col_res), std::move(null_map))); @@ -1388,6 +1389,7 @@ public: cut_address(address, pos, bytes_to_cut_count); offsets_res[i] = pos - begin; } + chars_res.resize(offsets_res[offsets_res.size() - 1]); block.replace_by_position(result, std::move(col_res)); return Status::OK(); diff --git a/be/src/vec/functions/function_uuid.cpp b/be/src/vec/functions/function_uuid.cpp index cee5fd7a363..bc4dec00705 100644 --- a/be/src/vec/functions/function_uuid.cpp +++ b/be/src/vec/functions/function_uuid.cpp @@ -180,6 +180,7 @@ public: col_offset[row] = col_offset[row - 1] + str_length; deserialize((char*)arg, col_data.data() + str_length * row); } + col_data.resize(str_length * input_rows_count); block.replace_by_position(result, std::move(result_column)); return Status::OK(); } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org