zclllyybb commented on code in PR #48054: URL: https://github.com/apache/doris/pull/48054#discussion_r2021371638
########## be/test/vec/utils/stringop_substring_test.cpp: ########## @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/utils/stringop_substring.h" + +#include <gtest/gtest.h> + +#include <cstdint> +#include <string> +#include <vector> + +#include "common/logging.h" +#include "runtime/primitive_type.h" +#include "testutil/column_helper.h" // ColumnHelper is used for constructing columns and Block +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +TEST(StringOPTest, testStringPushOperations) { + // Create expected result column with various string types + std::vector<std::string> expected_strings = { + "", // Empty string + "abc", // Simple ASCII string + "中文测试", // UTF-8 multi-byte string + "", // NULL marker + " ", // Whitespace only + "!@#$%^&*()", // Special characters + "123456789012345678901234567890", // Medium-length string + std::string(10000, 'x'), // Very long string + "混合English和中文", // Mixed language + "包含\n换行\t制表符", // Contains escape characters + "末尾有空格 " // Trailing spaces + }; + + 
std::vector<uint8_t> null_flags = {false, false, false, true, false, false, + false, false, false, false, false}; // The fourth one is NULL + + auto expected_col = + ColumnHelper::create_nullable_column<DataTypeString>(expected_strings, null_flags); + + auto test_col = ColumnHelper::create_nullable_column<DataTypeString>({}, {}); + auto* test_column_nullable = dynamic_cast<const ColumnNullable*>(test_col.get()); + auto& test_nested_column = const_cast<ColumnString&>( + static_cast<const ColumnString&>(test_column_nullable->get_nested_column())); + auto& test_chars = test_nested_column.get_chars(); + auto& test_offsets = test_nested_column.get_offsets(); + test_offsets.resize(expected_strings.size()); + NullMap test_null_map(expected_strings.size(), false); + + // Use loop and conditions to select different push functions to fill the test column + for (size_t i = 0; i < expected_strings.size(); ++i) { + if (null_flags[i]) { + // Use push_null_string for NULL values + StringOP::push_null_string(i, test_chars, test_offsets, test_null_map); + } else if (expected_strings[i].empty()) { + // Use push_empty_string for empty strings + StringOP::push_empty_string(i, test_chars, test_offsets); + } else { + // Use push_value_string for normal strings + StringOP::push_value_string(expected_strings[i], i, test_chars, test_offsets); + } + } + + for (size_t i = 0; i < expected_strings.size(); ++i) { + if (null_flags[i]) { + continue; // Skip content validation for NULL values + } + ASSERT_EQ(static_cast<bool>(test_null_map[i]), static_cast<bool>(null_flags[i])) + << "Row " << i << " expected to be non-null."; + + size_t row_length = test_offsets[i] - (i == 0 ? 0 : test_offsets[i - 1]); + ASSERT_EQ(row_length, expected_strings[i].size()) + << "Row " << i << " length mismatch: " << row_length << " vs " + << expected_strings[i].size(); + + std::string actual(reinterpret_cast<const char*>(test_chars.data() + + (i == 0 ? 
0 : test_offsets[i - 1])), + row_length); + ASSERT_EQ(actual, expected_strings[i]) << "Row " << i << " content mismatch."; + } +} + +TEST(StringOPTest, testPushValueStringReservedAndAllowOverFlow) { + // Create expected result column with various string types + std::vector<std::string> expected_strings = {"", "abc", "中文测试", "", + " ", "!@#$%^&*()", "1234567890", "xxxxxx", + "a", " ", "!@#$%^&*()"}; + std::vector<uint8_t> null_flags = {false, false, false, true, false, false, + false, false, false, false, false}; // The fourth one is NULL + + auto expected_col = + ColumnHelper::create_nullable_column<DataTypeString>(expected_strings, null_flags); + + auto test_col = ColumnHelper::create_nullable_column<DataTypeString>({}, {}); + auto* test_column_nullable = dynamic_cast<const ColumnNullable*>(test_col.get()); + auto& test_nested_column = const_cast<ColumnString&>( + static_cast<const ColumnString&>(test_column_nullable->get_nested_column())); + auto& test_chars = test_nested_column.get_chars(); + auto& test_offsets = test_nested_column.get_offsets(); + test_offsets.resize(expected_strings.size()); + NullMap test_null_map(expected_strings.size(), false); + + // Calculate total length of all strings for reserving space + size_t total_length = 0; + for (const auto& str : expected_strings) { + total_length += str.size(); + } + test_chars.reserve(total_length); + for (size_t i = 0; i < expected_strings.size(); ++i) { + if (null_flags[i]) { + // Use push_null_string for NULL values + StringOP::push_null_string(i, test_chars, test_offsets, test_null_map); + } else if (expected_strings[i].empty()) { + // Use push_empty_string for empty strings + StringOP::push_empty_string(i, test_chars, test_offsets); + } else { + // Reserve all space at once + StringOP::push_value_string_reserved_and_allow_overflow(expected_strings[i], i, + test_chars, test_offsets); + } + } + + for (size_t i = 0; i < expected_strings.size(); ++i) { + if (null_flags[i]) { + continue; // Skip content 
validation for NULL values + } + ASSERT_EQ(static_cast<bool>(test_null_map[i]), static_cast<bool>(null_flags[i])) + << "Row " << i << " expected to be non-null."; + + size_t row_length = test_offsets[i] - (i == 0 ? 0 : test_offsets[i - 1]); + ASSERT_EQ(row_length, expected_strings[i].size()) + << "Row " << i << " length mismatch: " << row_length << " vs " + << expected_strings[i].size(); + + std::string actual(reinterpret_cast<const char*>(test_chars.data() + + (i == 0 ? 0 : test_offsets[i - 1])), + row_length); + ASSERT_EQ(actual, expected_strings[i]) << "Row " << i << " content mismatch."; + } +} +TEST(StringOPTest, testFastRepeat) { + const std::string src = "example"; + { + int32_t repeat_times = 0; + // Allocate enough buffer (when repeat_times is 0, the size is 0) + std::vector<uint8_t> dst(src.size() * repeat_times); + StringOP::fast_repeat(dst.data(), reinterpret_cast<const uint8_t*>(src.data()), src.size(), + repeat_times); + // dst length is 0, no content written + ASSERT_EQ(dst.size(), 0); + } + + { + for (int32_t repeat_times = 1; repeat_times <= 10; ++repeat_times) { + std::vector<uint8_t> dst(src.size() * repeat_times); + StringOP::fast_repeat(dst.data(), reinterpret_cast<const uint8_t*>(src.data()), + src.size(), repeat_times); + std::string result(reinterpret_cast<const char*>(dst.data()), dst.size()); + std::string expected; + for (int i = 0; i < repeat_times; ++i) { + expected += src; + } + ASSERT_EQ(result, expected) << "Failed for repeat_times = " << repeat_times; + } + } +} + +TEST(StringOPTest, testSubstringExecute) { + std::vector<std::tuple<int32_t, int32_t>> test_cases = { + {0, 0}, // Zero parameters + {1, 5}, // Positive start position + {-1, 3}, // Negative start position + {2, -1} // Negative length + }; + + for (const auto& [start_val, len_val] : test_cases) { + std::vector<std::string> input_strings = {""}; Review Comment: 为啥只有一个空串?似乎测不到东西啊 ########## be/test/vec/utils/stringop_substring_test.cpp: ########## @@ -0,0 +1,228 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/utils/stringop_substring.h" + +#include <gtest/gtest.h> + +#include <cstdint> +#include <string> +#include <vector> + +#include "common/logging.h" +#include "runtime/primitive_type.h" +#include "testutil/column_helper.h" // ColumnHelper is used for constructing columns and Block +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +TEST(StringOPTest, testStringPushOperations) { + // Create expected result column with various string types + std::vector<std::string> expected_strings = { + "", // Empty string + "abc", // Simple ASCII string + "中文测试", // UTF-8 multi-byte string + "", // NULL marker + " ", // Whitespace only + "!@#$%^&*()", // Special characters + "123456789012345678901234567890", // Medium-length string + std::string(10000, 'x'), // Very long string + "混合English和中文", // Mixed language + "包含\n换行\t制表符", // Contains escape characters + "末尾有空格 " // Trailing spaces + }; + + std::vector<uint8_t> null_flags = {false, false, false, true, false, false, + false, false, 
false, false, false}; // The fourth one is NULL + + auto expected_col = + ColumnHelper::create_nullable_column<DataTypeString>(expected_strings, null_flags); + + auto test_col = ColumnHelper::create_nullable_column<DataTypeString>({}, {}); + auto* test_column_nullable = dynamic_cast<const ColumnNullable*>(test_col.get()); Review Comment: 这个地方是不是static_cast就行了 ########## be/test/vec/utils/stringop_substring_test.cpp: ########## @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/utils/stringop_substring.h" + +#include <gtest/gtest.h> + +#include <cstdint> +#include <string> +#include <vector> + +#include "common/logging.h" +#include "runtime/primitive_type.h" +#include "testutil/column_helper.h" // ColumnHelper is used for constructing columns and Block +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +TEST(StringOPTest, testStringPushOperations) { + // Create expected result column with various string types + std::vector<std::string> expected_strings = { + "", // Empty string + "abc", // Simple ASCII string + "中文测试", // UTF-8 multi-byte string + "", // NULL marker + " ", // Whitespace only + "!@#$%^&*()", // Special characters + "123456789012345678901234567890", // Medium-length string + std::string(10000, 'x'), // Very long string + "混合English和中文", // Mixed language + "包含\n换行\t制表符", // Contains escape characters + "末尾有空格 " // Trailing spaces + }; + + std::vector<uint8_t> null_flags = {false, false, false, true, false, false, + false, false, false, false, false}; // The fourth one is NULL + + auto expected_col = + ColumnHelper::create_nullable_column<DataTypeString>(expected_strings, null_flags); + + auto test_col = ColumnHelper::create_nullable_column<DataTypeString>({}, {}); + auto* test_column_nullable = dynamic_cast<const ColumnNullable*>(test_col.get()); + auto& test_nested_column = const_cast<ColumnString&>( + static_cast<const ColumnString&>(test_column_nullable->get_nested_column())); + auto& test_chars = test_nested_column.get_chars(); + auto& test_offsets = test_nested_column.get_offsets(); + test_offsets.resize(expected_strings.size()); + NullMap test_null_map(expected_strings.size(), false); + + // Use loop and conditions to select different push functions to fill the test column + for (size_t i = 0; i < 
expected_strings.size(); ++i) { + if (null_flags[i]) { + // Use push_null_string for NULL values + StringOP::push_null_string(i, test_chars, test_offsets, test_null_map); + } else if (expected_strings[i].empty()) { + // Use push_empty_string for empty strings + StringOP::push_empty_string(i, test_chars, test_offsets); + } else { + // Use push_value_string for normal strings + StringOP::push_value_string(expected_strings[i], i, test_chars, test_offsets); + } + } + + for (size_t i = 0; i < expected_strings.size(); ++i) { + if (null_flags[i]) { + continue; // Skip content validation for NULL values + } + ASSERT_EQ(static_cast<bool>(test_null_map[i]), static_cast<bool>(null_flags[i])) + << "Row " << i << " expected to be non-null."; + + size_t row_length = test_offsets[i] - (i == 0 ? 0 : test_offsets[i - 1]); + ASSERT_EQ(row_length, expected_strings[i].size()) + << "Row " << i << " length mismatch: " << row_length << " vs " + << expected_strings[i].size(); + + std::string actual(reinterpret_cast<const char*>(test_chars.data() + + (i == 0 ? 0 : test_offsets[i - 1])), Review Comment: 这个地方应该不需要这个三元表达式?我们的[-1]是有良定义的 ########## be/test/vec/utils/stringop_substring_test.cpp: ########## @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/utils/stringop_substring.h" + +#include <gtest/gtest.h> + +#include <cstdint> +#include <string> +#include <vector> + +#include "common/logging.h" +#include "runtime/primitive_type.h" +#include "testutil/column_helper.h" // ColumnHelper is used for constructing columns and Block +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +TEST(StringOPTest, testStringPushOperations) { + // Create expected result column with various string types + std::vector<std::string> expected_strings = { + "", // Empty string + "abc", // Simple ASCII string + "中文测试", // UTF-8 multi-byte string + "", // NULL marker + " ", // Whitespace only + "!@#$%^&*()", // Special characters + "123456789012345678901234567890", // Medium-length string + std::string(10000, 'x'), // Very long string + "混合English和中文", // Mixed language + "包含\n换行\t制表符", // Contains escape characters + "末尾有空格 " // Trailing spaces + }; + + std::vector<uint8_t> null_flags = {false, false, false, true, false, false, + false, false, false, false, false}; // The fourth one is NULL + + auto expected_col = + ColumnHelper::create_nullable_column<DataTypeString>(expected_strings, null_flags); + + auto test_col = ColumnHelper::create_nullable_column<DataTypeString>({}, {}); + auto* test_column_nullable = dynamic_cast<const ColumnNullable*>(test_col.get()); + auto& test_nested_column = const_cast<ColumnString&>( + static_cast<const ColumnString&>(test_column_nullable->get_nested_column())); + auto& test_chars = test_nested_column.get_chars(); + auto& test_offsets = test_nested_column.get_offsets(); + test_offsets.resize(expected_strings.size()); + NullMap test_null_map(expected_strings.size(), false); + + // Use loop and 
conditions to select different push functions to fill the test column + for (size_t i = 0; i < expected_strings.size(); ++i) { + if (null_flags[i]) { + // Use push_null_string for NULL values + StringOP::push_null_string(i, test_chars, test_offsets, test_null_map); + } else if (expected_strings[i].empty()) { + // Use push_empty_string for empty strings + StringOP::push_empty_string(i, test_chars, test_offsets); + } else { + // Use push_value_string for normal strings + StringOP::push_value_string(expected_strings[i], i, test_chars, test_offsets); + } + } + + for (size_t i = 0; i < expected_strings.size(); ++i) { + if (null_flags[i]) { + continue; // Skip content validation for NULL values + } + ASSERT_EQ(static_cast<bool>(test_null_map[i]), static_cast<bool>(null_flags[i])) + << "Row " << i << " expected to be non-null."; + + size_t row_length = test_offsets[i] - (i == 0 ? 0 : test_offsets[i - 1]); + ASSERT_EQ(row_length, expected_strings[i].size()) + << "Row " << i << " length mismatch: " << row_length << " vs " + << expected_strings[i].size(); + + std::string actual(reinterpret_cast<const char*>(test_chars.data() + Review Comment: 似乎没必要用reinterpret_cast?static_cast足够 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org