amorynan commented on code in PR #42269: URL: https://github.com/apache/doris/pull/42269#discussion_r1870515229
########## be/test/vec/columns/common_column_test.h: ########## @@ -0,0 +1,766 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest-message.h> +#include <gtest/gtest-test-part.h> +#include <gtest/gtest.h> + +#include "olap/schema.h" +#include "vec/columns/column.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_map.h" +#include "vec/columns/columns_number.h" +#include "vec/core/field.h" +#include "vec/core/sort_block.h" +#include "vec/core/sort_description.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_map.h" + +// this test is going to be a common test template for all columns; every column should have unit tests that cover the functions defined in column +// for example column_array should test these functions: +// size, reserve, resize, empty, byte_size, allocated_bytes, clone_resized, +// get_shrinked_column, filter, filter_by_selector, serialize_vec, deserialize_vec, get_max_row_byte_size +// +namespace doris::vectorized { + +class CommonColumnTest : public ::testing::Test { +public: + void SetUp() override { + col_str = ColumnString::create(); + col_str->insert_data("aaa", 3); 
+ col_str->insert_data("bb", 2); + col_str->insert_data("cccc", 4); + + col_int = ColumnInt64::create(); + col_int->insert_value(1); + col_int->insert_value(2); + col_int->insert_value(3); + + col_dcm = ColumnDecimal64::create(0, 3); + col_dcm->insert_value(1.23); + col_dcm->insert_value(4.56); + col_dcm->insert_value(7.89); + + col_arr = ColumnArray::create(ColumnInt64::create(), ColumnArray::ColumnOffsets::create()); + Array array1 = {1, 2, 3}; + Array array2 = {4}; + col_arr->insert(array1); + col_arr->insert(Array()); + col_arr->insert(array2); + + col_map = ColumnMap::create(ColumnString::create(), ColumnInt64::create(), + ColumnArray::ColumnOffsets::create()); + Array k1 = {"a", "b", "c"}; + Array v1 = {1, 2, 3}; + Array k2 = {"d"}; + Array v2 = {4}; + Array a = Array(); + Map map1, map2, map3; + map1.push_back(k1); + map1.push_back(v1); + col_map->insert(map1); + map3.push_back(a); + map3.push_back(a); + col_map->insert(map3); + map2.push_back(k2); + map2.push_back(v2); + col_map->insert(map2); + } + + ColumnString::MutablePtr col_str; + ColumnInt64::MutablePtr col_int; + ColumnDecimal64::MutablePtr col_dcm; + ColumnArray::MutablePtr col_arr; + ColumnMap::MutablePtr col_map; + + void checkColumn(const IColumn& col1, const IColumn& col2, const IDataType& dataType, + size_t column_size) { + if (WhichDataType(dataType).is_map()) { + auto map1 = check_and_get_column<ColumnMap>(col1); + auto map2 = check_and_get_column<ColumnMap>(col2); + const DataTypeMap& rhs_map = static_cast<const DataTypeMap&>(dataType); + checkColumn(map1->get_keys(), map2->get_keys(), *rhs_map.get_key_type(), + map1->get_keys().size()); + checkColumn(map2->get_values(), map2->get_values(), *rhs_map.get_value_type(), + map1->get_values().size()); + } else { + if (WhichDataType(dataType).is_int8()) { + auto c1 = check_and_get_column<ColumnInt8>(col1); + auto c2 = check_and_get_column<ColumnInt8>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), 
c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_int16()) { + auto c1 = check_and_get_column<ColumnInt16>(col1); + auto c2 = check_and_get_column<ColumnInt16>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_int32()) { + auto c1 = check_and_get_column<ColumnInt32>(col1); + auto c2 = check_and_get_column<ColumnInt32>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_int64()) { + auto c1 = check_and_get_column<ColumnInt64>(col1); + auto c2 = check_and_get_column<ColumnInt64>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_int128()) { + auto c1 = check_and_get_column<ColumnInt128>(col1); + auto c2 = check_and_get_column<ColumnInt128>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_float32()) { + auto c1 = check_and_get_column<ColumnFloat32>(col1); + auto c2 = check_and_get_column<ColumnFloat32>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_float64()) { + auto c1 = check_and_get_column<ColumnFloat64>(col1); + auto c2 = check_and_get_column<ColumnFloat64>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_uint8()) { + auto c1 = check_and_get_column<ColumnUInt8>(col1); + auto c2 = check_and_get_column<ColumnUInt8>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_uint16()) { + auto c1 = check_and_get_column<ColumnUInt16>(col1); + auto c2 = 
check_and_get_column<ColumnUInt16>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_uint32()) { + auto c1 = check_and_get_column<ColumnUInt32>(col1); + auto c2 = check_and_get_column<ColumnUInt32>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_uint64()) { + auto c1 = check_and_get_column<ColumnUInt64>(col1); + auto c2 = check_and_get_column<ColumnUInt64>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_decimal()) { + auto c1 = check_and_get_column<ColumnDecimal64>(col1); + auto c2 = check_and_get_column<ColumnDecimal64>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_decimal32()) { + auto c1 = check_and_get_column<ColumnDecimal32>(col1); + auto c2 = check_and_get_column<ColumnDecimal32>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_decimal64()) { + auto c1 = check_and_get_column<ColumnDecimal64>(col1); + auto c2 = check_and_get_column<ColumnDecimal64>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_decimal128v2()) { + auto c1 = check_and_get_column<ColumnDecimal128V2>(col1); + auto c2 = check_and_get_column<ColumnDecimal128V2>(col2); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_decimal128v3()) { + auto c1 = check_and_get_column<ColumnDecimal128V3>(col1); + auto c2 = check_and_get_column<ColumnDecimal128V3>(col2); + for (size_t i = 0; i < column_size; ++i) { + 
EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else if (WhichDataType(dataType).is_decimal256()) { + auto c1 = check_and_get_column<ColumnDecimal<Decimal256>>(col1); + auto c2 = check_and_get_column<ColumnDecimal<Decimal256>>(col1); + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(c1->get_element(i), c2->get_element(i)); + } + } else { + for (size_t i = 0; i < column_size; ++i) { + EXPECT_EQ(col1.get_data_at(i), col2.get_data_at(i)); + } + } + } + } + + void printColumn(const IColumn& column, const IDataType& dataType) { + std::cout << "column total size: " << column.size() << std::endl; + if (WhichDataType(dataType).is_map()) { + auto map = check_and_get_column<ColumnMap>(column); + std::cout << "map {keys, values}" << std::endl; + const DataTypeMap& rhs_map = static_cast<const DataTypeMap&>(dataType); + printColumn(map->get_keys(), *rhs_map.get_key_type()); + printColumn(map->get_values(), *rhs_map.get_value_type()); + } else if (WhichDataType(dataType).is_array()) { + auto array = check_and_get_column<ColumnArray>(column); + std::cout << "array: " << std::endl; + const auto& rhs_array = static_cast<const DataTypeArray&>(dataType); + printColumn(array->get_data(), *rhs_array.get_nested_type()); + } else { + size_t column_size = column.size(); + std::cout << column.get_name() << ": " << std::endl; + if (WhichDataType(dataType).is_int8()) { + auto col = check_and_get_column<ColumnInt8>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_int16()) { + auto col = check_and_get_column<ColumnInt16>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_int32()) { + auto col = check_and_get_column<ColumnInt32>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_int64()) { + auto col = 
check_and_get_column<ColumnInt64>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_int128()) { + auto col = check_and_get_column<ColumnInt128>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_float32()) { + auto col = check_and_get_column<ColumnFloat32>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_float64()) { + auto col = check_and_get_column<ColumnFloat64>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_uint8()) { + auto col = check_and_get_column<ColumnUInt8>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_uint16()) { + auto col = check_and_get_column<ColumnUInt16>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_uint32()) { + auto col = check_and_get_column<ColumnUInt32>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_uint64()) { + auto col = check_and_get_column<ColumnUInt64>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_uint128()) { + auto col = check_and_get_column<ColumnUInt128>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_data_at(i) << " "; + } + } else if (WhichDataType(dataType).is_decimal()) { + auto col = check_and_get_column<ColumnDecimal64>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if 
(WhichDataType(dataType).is_decimal32()) { + auto col = check_and_get_column<ColumnDecimal32>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_decimal64()) { + auto col = check_and_get_column<ColumnDecimal64>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_decimal128v2()) { + auto col = check_and_get_column<ColumnDecimal128V2>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_decimal128v3()) { + auto col = check_and_get_column<ColumnDecimal128V3>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_decimal256()) { + auto col = check_and_get_column<ColumnDecimal<Decimal256>>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_element(i) << " "; + } + } else if (WhichDataType(dataType).is_date()) { + auto col = check_and_get_column<ColumnDate>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_data_at(i) << " "; + } + } else if (WhichDataType(dataType).is_date_time()) { + auto col = check_and_get_column<ColumnDateTime>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_data_at(i) << " "; + } + } else if (WhichDataType(dataType).is_date_v2()) { + auto col = check_and_get_column<ColumnDateV2>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_data_at(i) << " "; + } + } else if (WhichDataType(dataType).is_date_time_v2()) { + auto col = check_and_get_column<ColumnDateTimeV2>(column); + for (size_t i = 0; i < column_size; ++i) { + std::cout << col->get_data_at(i) << " "; + } + } else { + std::cout << "data type: " << dataType.get_name() << std::endl; + std::cout << "column type: " << column.get_name() << 
std::endl; + for (size_t i = 0; i < column_size; ++i) { + std::cout << column.get_data_at(i).to_string() << " "; + } + } + std::cout << std::endl; + } + } + // column size changed calculation: + // size, reserve, resize, empty, byte_size, allocated_bytes, clone_resized, get_shrinked_column + // cut(LIMIT operation), shrink + void sizeAssert(MutableColumnPtr col, size_t expect_size) { + EXPECT_EQ(col->size(), expect_size); + } + + // empty just use size() == 0 to impl as default behavior + void emptyAssert(MutableColumnPtr col) { EXPECT_EQ(col->size(), 0); } + + // reserve, resize, byte_size, allocated_bytes, clone_resized, get_shrinked_column + void reserveAssert(MutableColumnPtr col, size_t expect_size) { + col->reserve(expect_size); + EXPECT_EQ(col->allocated_bytes(), expect_size); + } + + // cut(LIMIT operation) will cut the column with the given from and to, and return the new column + // notice return column is clone from origin column + void cutAssert(MutableColumnPtr col, size_t from, size_t to, size_t expect_size) { + auto ori = col->size(); + auto ptr = col->cut(from, to); + EXPECT_EQ(ptr->size(), expect_size); + EXPECT_EQ(col->size(), ori); + } + + // shrink is cut/append the column with the given size, which called from Block::set_num_rows + // and some Operator may call this set_num_rows to make rows satisfied, like limit operation + // but different from cut behavior which + // return column is mutate from origin column + void shrinkAssert(MutableColumnPtr col, size_t shrink_size) { + auto ptr = col->shrink(shrink_size); + EXPECT_EQ(ptr->size(), shrink_size); + EXPECT_EQ(col->size(), shrink_size); + } + + // resize has fixed-column implementation and variable-column implementation + // like string column, the resize will resize the offsets column but not the data column (because it doesn't matter the size of data column, all operation for string column is based on the offsets column) + // like vector column, the resize will resize the data column + // 
like array column, the resize will resize the offsets column and the data column (in the creator we check that the size of the data column is the same as the size of the offsets column) + void resizeAssert(MutableColumnPtr col, size_t expect_size) { + col->resize(expect_size); + EXPECT_EQ(col->size(), expect_size); + } + + // replicate is clone with new column from the origin column, always from ColumnConst to expand the column + void replicateAssert(MutableColumnPtr col, IColumn::Offsets& offsets) { + auto new_col = col->replicate(offsets); + EXPECT_EQ(new_col->size(), offsets.back()); + } + + // byte size is just the approximate size of the column + // as fixed column type, like column_vector, the byte size is sizeof(columnType) * size() + // as variable column type, like column_string, the byte size is sum of chars size() and offsets size * sizeof(offsetType) + void byteSizeAssert(MutableColumnPtr col, size_t expect_size) { + EXPECT_EQ(col->byte_size(), expect_size); + } + + // allocated bytes is the real size of the column + void allocatedBytesAssert(MutableColumnPtr col, size_t expect_size) { + EXPECT_EQ(col->allocated_bytes(), expect_size); + } + + // clone_resized will clone the column and cut/append to the new column with the size of the original column + void cloneResizedAssert(MutableColumnPtr col, size_t expect_size) { + auto new_col = col->clone_resized(expect_size); + EXPECT_EQ(new_col->size(), expect_size); + } + + // get_shrinked_column should only happen in char-type column or nested char-type column + // it just shrinks the trailing zeros for char-type column, which happens in segmentIterator + // eg. column_desc: char(6), insert into char(3), the char(3) will be padded with 3 zeros at the end for writing to disk. 
+ // but we select should just print the char(3) without the padding zeros + // limit and topN operation will trigger this function call + void getShrinkedColumnAssert(MutableColumnPtr col, size_t spcific_size_defined) { + EXPECT_TRUE(col->could_shrinked_column()); + auto new_col = col->get_shrinked_column(); + for (size_t i = 0; i < new_col->size(); i++) { + EXPECT_EQ(col->get_data_at(i).size, spcific_size_defined); + } + } + + //serialize and deserialize which usually used in AGG function: + // serialize_value_into_arena, deserialize_and_insert_from_arena (called by AggregateFunctionDistinctMultipleGenericData, group_array_intersect, nested-types serder like: DataTypeArraySerDe::write_one_cell_to_jsonb) + void ser_deserialize_with_arena_impl(MutableColumns& columns, const DataTypes& data_types) { Review Comment: Because in this situation several columns can be passed at one time, as the callers of serialize_value_into_arena/deserialize_and_insert_from_arena do. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org