This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new 41b5aeb1c5 [fix](concat) ColumnString::chars is resized with wrong size (#22610) 41b5aeb1c5 is described below commit 41b5aeb1c5332e41dca42ac8bd6558ed5057b184 Author: TengJianPing <18241664+jackte...@users.noreply.github.com> AuthorDate: Fri Aug 4 19:13:35 2023 +0800 [fix](concat) ColumnString::chars is resized with wrong size (#22610) FunctionStringConcat::execute_impl resized the result buffer with a size that included a null terminator for each string, which causes ColumnString::chars.size() to not match ColumnString::offsets.back(). This mismatch causes problems for some string functions, e.g. like and regexp. --- be/src/vec/columns/column_string.cpp | 17 ++++++++++ be/src/vec/columns/column_string.h | 2 ++ be/src/vec/functions/function_string.h | 2 -- be/test/vec/core/column_string_test.cpp | 59 +++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 2 deletions(-) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index ed3cd28be9..5d2670acb7 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -35,6 +35,23 @@ namespace doris::vectorized { +void ColumnString::sanity_check() const { + auto count = offsets.size(); + if (chars.size() != offsets[count - 1]) { + LOG(FATAL) << "row count: " << count << ", chars.size(): " << chars.size() << ", offset[" + << count - 1 << "]: " << offsets[count - 1]; + } + if (offsets[-1] != 0) { + LOG(FATAL) << "wrong offsets[-1]: " << offsets[-1]; + } + for (size_t i = 0; i < count; ++i) { + if (offsets[i] < offsets[i - 1]) { + LOG(FATAL) << "row count: " << count << ", offsets[" << i << "]: " << offsets[i] + << ", offsets[" << i - 1 << "]: " << offsets[i - 1]; + } + } +} + MutableColumnPtr ColumnString::clone_resized(size_t to_size) const { auto res = ColumnString::create(); if (to_size == 0) { diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 
63ebeb4686..26a7093140 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -106,6 +106,8 @@ private: chars(src.chars.begin(), src.chars.end()) {} public: + void sanity_check() const; + const char* get_family_name() const override { return "String"; } size_t size() const override { return offsets.size(); } diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 83f98d726a..32e373ffa0 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -776,8 +776,6 @@ public: if ((UNLIKELY(UINT_MAX - input_rows_count < res_reserve_size))) { return Status::BufferAllocFailed("concat output is too large to allocate"); } - // for each terminal zero - res_reserve_size += input_rows_count; res_data.resize(res_reserve_size); diff --git a/be/test/vec/core/column_string_test.cpp b/be/test/vec/core/column_string_test.cpp new file mode 100644 index 0000000000..81f41bd11c --- /dev/null +++ b/be/test/vec/core/column_string_test.cpp @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/columns/column_string.h" + +#include <gtest/gtest.h> + +#include "vec/core/block.h" +#include "vec/data_types/data_type_string.h" +#include "vec/functions/function_string.h" + +namespace doris::vectorized { +TEST(ColumnStringTest, TestConcat) { + Block block; + vectorized::DataTypePtr str_type = std::make_shared<vectorized::DataTypeString>(); + + auto str_col0 = ColumnString::create(); + std::vector<std::string> vals0 = {"aaa", "bb", "cccc"}; + for (auto& v : vals0) { + str_col0->insert_data(v.data(), v.size()); + } + block.insert({std::move(str_col0), str_type, "test_str_col0"}); + + auto str_col1 = ColumnString::create(); + std::vector<std::string> vals1 = {"3", "2", "4"}; + for (auto& v : vals1) { + str_col1->insert_data(v.data(), v.size()); + } + block.insert({std::move(str_col1), str_type, "test_str_col1"}); + + auto str_col_res = ColumnString::create(); + block.insert({std::move(str_col_res), str_type, "test_str_res"}); + + ColumnNumbers arguments = {0, 1}; + + FunctionStringConcat func_concat; + auto status = func_concat.execute_impl(nullptr, block, arguments, 2, 3); + EXPECT_TRUE(status.ok()); + + auto actual_res_col = block.get_by_position(2).column; + EXPECT_EQ(actual_res_col->size(), 3); + auto actual_res_col_str = assert_cast<const ColumnString*>(actual_res_col.get()); + actual_res_col_str->sanity_check(); +} +} // namespace doris::vectorized \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org