github-actions[bot] commented on code in PR #36289:
URL: https://github.com/apache/doris/pull/36289#discussion_r1644468430
########## be/src/vec/sink/writer/iceberg/partition_transformers.h: ########## @@ -43,38 +43,1386 @@ class PartitionColumnTransforms { const doris::iceberg::PartitionField& field, const TypeDescriptor& source_type); }; +class PartitionColumnTransformUtils { +public: + static DateV2Value<DateV2ValueType>& epoch_date() { + static DateV2Value<DateV2ValueType> epoch_date; + static bool initialized = false; + if (!initialized) { + epoch_date.from_date_str("1970-01-01 00:00:00", 19); + initialized = true; + } + return epoch_date; + } + + static DateV2Value<DateTimeV2ValueType>& epoch_datetime() { + static DateV2Value<DateTimeV2ValueType> epoch_datetime; + static bool initialized = false; + if (!initialized) { + epoch_datetime.from_date_str("1970-01-01 00:00:00", 19); + initialized = true; + } + return epoch_datetime; + } + + static std::string human_year(int year_ordinal) { + auto year = std::chrono::year_month_day( + std::chrono::sys_days(std::chrono::floor<std::chrono::days>( + EPOCH + std::chrono::years(year_ordinal)))) + .year(); + return std::to_string(static_cast<int>(year)); + } + + static std::string human_month(int month_ordinal) { + auto ymd = std::chrono::year_month_day(std::chrono::sys_days( + std::chrono::floor<std::chrono::days>(EPOCH + std::chrono::months(month_ordinal)))); + return fmt::format("{:04d}-{:02d}", static_cast<int>(ymd.year()), + static_cast<unsigned>(ymd.month())); + } + + static std::string human_day(int day_ordinal) { + auto ymd = std::chrono::year_month_day(std::chrono::sys_days( + std::chrono::floor<std::chrono::days>(EPOCH + std::chrono::days(day_ordinal)))); + return fmt::format("{:04d}-{:02d}-{:02d}", static_cast<int>(ymd.year()), + static_cast<unsigned>(ymd.month()), static_cast<unsigned>(ymd.day())); + } + + static std::string human_hour(int hour_ordinal) { + int day_value = hour_ordinal / 24; + int housr_value = hour_ordinal % 24; + auto ymd = std::chrono::year_month_day(std::chrono::sys_days( + 
std::chrono::floor<std::chrono::days>(EPOCH + std::chrono::days(day_value)))); + return fmt::format("{:04d}-{:02d}-{:02d}-{:02d}", static_cast<int>(ymd.year()), + static_cast<unsigned>(ymd.month()), static_cast<unsigned>(ymd.day()), + housr_value); + } + +private: + static const std::chrono::time_point<std::chrono::system_clock> EPOCH; + PartitionColumnTransformUtils() = default; +}; + class PartitionColumnTransform { public: PartitionColumnTransform() = default; virtual ~PartitionColumnTransform() = default; - virtual bool preserves_non_null() const { return false; } - - virtual bool monotonic() const { return true; } - - virtual bool temporal() const { return false; } + virtual std::string name() const; virtual const TypeDescriptor& get_result_type() const = 0; - virtual bool is_void() const { return false; } - - virtual ColumnWithTypeAndName apply(Block& block, int idx) = 0; + virtual ColumnWithTypeAndName apply(Block& block, int column_pos) = 0; virtual std::string to_human_string(const TypeDescriptor& type, const std::any& value) const; + + virtual std::string get_partition_value(const TypeDescriptor& type, + const std::any& value) const; }; class IdentityPartitionColumnTransform : public PartitionColumnTransform { public: IdentityPartitionColumnTransform(const TypeDescriptor& source_type) : _source_type(source_type) {} - virtual const TypeDescriptor& get_result_type() const { return _source_type; } + std::string name() const override { return "Identity"; } + + const TypeDescriptor& get_result_type() const override { return _source_type; } + + ColumnWithTypeAndName apply(Block& block, int column_pos) override { + const ColumnWithTypeAndName& column_with_type_and_name = block.get_by_position(column_pos); + return {column_with_type_and_name.column, column_with_type_and_name.type, + column_with_type_and_name.name}; + } + +private: + TypeDescriptor _source_type; +}; + +class StringTruncatePartitionColumnTransform : public PartitionColumnTransform { +public: + 
StringTruncatePartitionColumnTransform(const TypeDescriptor& source_type, int width) + : _source_type(source_type), _width(width) {} + + std::string name() const override { return "StringTruncate"; } + + const TypeDescriptor& get_result_type() const override { return _source_type; } + + ColumnWithTypeAndName apply(Block& block, int column_pos) override { + auto int_type = std::make_shared<DataTypeInt32>(); + size_t num_columns_without_result = block.columns(); + const ColumnWithTypeAndName& column_with_type_and_name = block.get_by_position(column_pos); + + ColumnPtr string_column_ptr; + ColumnPtr null_map_column_ptr; + bool is_nullable = false; + if (auto* nullable_column = + check_and_get_column<ColumnNullable>(column_with_type_and_name.column)) { + null_map_column_ptr = nullable_column->get_null_map_column_ptr(); + string_column_ptr = nullable_column->get_nested_column_ptr(); + is_nullable = true; + } else { + string_column_ptr = column_with_type_and_name.column; + is_nullable = false; + } + block.replace_by_position(column_pos, std::move(string_column_ptr)); + block.insert( + {int_type->create_column_const(block.rows(), to_field(1)), int_type, "const 1"}); + block.insert({int_type->create_column_const(block.rows(), to_field(_width)), int_type, + fmt::format("const {}", _width)}); + block.insert({nullptr, std::make_shared<DataTypeString>(), "result"}); + ColumnNumbers temp_arguments(3); + temp_arguments[0] = column_pos; // str column + temp_arguments[1] = num_columns_without_result; // pos + temp_arguments[2] = num_columns_without_result + 1; // width + size_t result_column_id = num_columns_without_result + 2; + + SubstringUtil::substring_execute(block, temp_arguments, result_column_id, block.rows()); + if (is_nullable) { + auto res_column = ColumnNullable::create(block.get_by_position(result_column_id).column, + null_map_column_ptr); + Block::erase_useless_column(&block, num_columns_without_result); + return {std::move(res_column), + 
DataTypeFactory::instance().create_data_type(get_result_type(), true), + column_with_type_and_name.name}; + } else { + auto res_column = block.get_by_position(result_column_id).column; + Block::erase_useless_column(&block, num_columns_without_result); + return {std::move(res_column), + DataTypeFactory::instance().create_data_type(get_result_type(), false), + column_with_type_and_name.name}; + } + } + +private: + TypeDescriptor _source_type; + int _width; +}; + +class IntegerTruncatePartitionColumnTransform : public PartitionColumnTransform { +public: + IntegerTruncatePartitionColumnTransform(const TypeDescriptor& source_type, int width) + : _source_type(source_type), _width(width) {} + + std::string name() const override { return "IntegerTruncate"; } + + const TypeDescriptor& get_result_type() const override { return _source_type; } + + ColumnWithTypeAndName apply(Block& block, int column_pos) override { + const ColumnWithTypeAndName& column_with_type_and_name = block.get_by_position(column_pos); + + ColumnPtr column_ptr; + ColumnPtr null_map_column_ptr; + bool is_nullable = false; + if (auto* nullable_column = + check_and_get_column<ColumnNullable>(column_with_type_and_name.column)) { + null_map_column_ptr = nullable_column->get_null_map_column_ptr(); + column_ptr = nullable_column->get_nested_column_ptr(); + is_nullable = true; + } else { + column_ptr = column_with_type_and_name.column; + is_nullable = false; + } + if (const ColumnInt32* col_integer = check_and_get_column<ColumnInt32>(column_ptr)) { + auto col_res = ColumnInt32::create(); + ColumnInt32::Container& out_data = col_res->get_data(); + out_data.resize(col_integer->get_data().size()); + const ColumnInt32::Container& in_data = col_integer->get_data(); + const int* end_in = in_data.data() + in_data.size(); + + const Int32* __restrict p_in = in_data.data(); + Int32* __restrict p_out = out_data.data(); + + while (p_in < end_in) { + *p_out = *p_in - ((*p_in % _width) + _width) % _width; + ++p_in; + ++p_out; 
+ } + if (is_nullable) { + auto res_column = ColumnNullable::create(std::move(col_res), null_map_column_ptr); + return {std::move(res_column), + DataTypeFactory::instance().create_data_type(get_result_type(), true), + column_with_type_and_name.name}; + } else { + return {std::move(col_res), + DataTypeFactory::instance().create_data_type(get_result_type(), false), + column_with_type_and_name.name}; + } + } else if (auto col_right_const = check_and_get_column_const<ColumnInt32>(column_ptr)) { + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "IntegerTruncatePartitionColumnTransform transform partition " + "error use column_pos {} ", + column_pos); + } else { + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "IntegerTruncatePartitionColumnTransform transform partition " + "error use column_pos {} ", + column_pos); + } + } + +private: + TypeDescriptor _source_type; + int _width; +}; + +class BigintTruncatePartitionColumnTransform : public PartitionColumnTransform { +public: + BigintTruncatePartitionColumnTransform(const TypeDescriptor& source_type, int width) + : _source_type(source_type), _width(width) {} + + std::string name() const override { return "BigintTruncate"; } + + const TypeDescriptor& get_result_type() const override { return _source_type; } + + ColumnWithTypeAndName apply(Block& block, int column_pos) override { + const ColumnWithTypeAndName& column_with_type_and_name = block.get_by_position(column_pos); + + ColumnPtr column_ptr; + ColumnPtr null_map_column_ptr; + bool is_nullable = false; + if (auto* nullable_column = + check_and_get_column<ColumnNullable>(column_with_type_and_name.column)) { + null_map_column_ptr = nullable_column->get_null_map_column_ptr(); + column_ptr = nullable_column->get_nested_column_ptr(); + is_nullable = true; + } else { + column_ptr = column_with_type_and_name.column; + is_nullable = false; + } + if (const ColumnInt64* col_integer = check_and_get_column<ColumnInt64>(column_ptr)) { + auto col_res = 
ColumnInt64::create(); + ColumnInt64::Container& out_data = col_res->get_data(); + out_data.resize(col_integer->get_data().size()); + const ColumnInt64::Container& in_data = col_integer->get_data(); + const Int64* end_in = in_data.data() + in_data.size(); + + const Int64* __restrict p_in = in_data.data(); + Int64* __restrict p_out = out_data.data(); + + while (p_in < end_in) { + *p_out = *p_in - ((*p_in % _width) + _width) % _width; + ++p_in; + ++p_out; + } + if (is_nullable) { + auto res_column = ColumnNullable::create(std::move(col_res), null_map_column_ptr); + return {std::move(res_column), + DataTypeFactory::instance().create_data_type(get_result_type(), true), + column_with_type_and_name.name}; + } else { + return {std::move(col_res), + DataTypeFactory::instance().create_data_type(get_result_type(), false), + column_with_type_and_name.name}; + } + } else if (auto col_right_const = check_and_get_column_const<ColumnInt64>(column_ptr)) { + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "BigintTruncatePartitionColumnTransform transform partition " + "error use column_pos {} ", + column_pos); + } else { + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "BigintTruncatePartitionColumnTransform transform partition " + "error use column_pos {} ", + column_pos); + } + } + +private: + TypeDescriptor _source_type; + int _width; +}; + +template <typename T> +class DecimalTruncatePartitionColumnTransform : public PartitionColumnTransform { +public: + DecimalTruncatePartitionColumnTransform(const TypeDescriptor& source_type, int width) + : _source_type(source_type), _width(width) {} + + std::string name() const override { return "DecimalTruncate"; } + + const TypeDescriptor& get_result_type() const override { return _source_type; } + + ColumnWithTypeAndName apply(Block& block, int column_pos) override { + const ColumnWithTypeAndName& column_with_type_and_name = block.get_by_position(column_pos); + + ColumnPtr column_ptr; + ColumnPtr 
null_map_column_ptr; + bool is_nullable = false; + if (auto* nullable_column = + check_and_get_column<ColumnNullable>(column_with_type_and_name.column)) { + null_map_column_ptr = nullable_column->get_null_map_column_ptr(); + column_ptr = nullable_column->get_nested_column_ptr(); + is_nullable = true; + } else { + column_ptr = column_with_type_and_name.column; + is_nullable = false; + } + + const auto* const decimal_col = check_and_get_column<ColumnDecimal<T>>(column_ptr); + const auto& vec_src = decimal_col->get_data(); + + auto col_res = ColumnDecimal<T>::create(vec_src.size(), decimal_col->get_scale()); + auto& vec_res = col_res->get_data(); + + const typename T::NativeType* __restrict p_in = + reinterpret_cast<const T::NativeType*>(vec_src.data()); + const typename T::NativeType* end_in = + reinterpret_cast<const T::NativeType*>(vec_src.data()) + vec_src.size(); + typename T::NativeType* __restrict p_out = reinterpret_cast<T::NativeType*>(vec_res.data()); + + while (p_in < end_in) { + typename T::NativeType remainder = ((*p_in % _width) + _width) % _width; + *p_out = *p_in - remainder; + ++p_in; + ++p_out; + } + + if (is_nullable) { + auto res_column = ColumnNullable::create(std::move(col_res), null_map_column_ptr); + return {res_column, + DataTypeFactory::instance().create_data_type(get_result_type(), true), + column_with_type_and_name.name}; + } else { + return {std::move(col_res), + DataTypeFactory::instance().create_data_type(get_result_type(), false), + column_with_type_and_name.name}; + } + } + +private: + TypeDescriptor _source_type; + int _width; +}; + +class IntBucketPartitionColumnTransform : public PartitionColumnTransform { +public: + IntBucketPartitionColumnTransform(const TypeDescriptor& source_type, int bucket_num) + : _source_type(source_type), _bucket_num(bucket_num), _target_type(TYPE_INT) {} + + std::string name() const override { return "IntBucket"; } + + const TypeDescriptor& get_result_type() const override { return _target_type; } + + 
ColumnWithTypeAndName apply(Block& block, int column_pos) override { + const ColumnWithTypeAndName& column_with_type_and_name = block.get_by_position(column_pos); + + ColumnPtr column_ptr; + ColumnPtr null_map_column_ptr; + bool is_nullable = false; + if (auto* nullable_column = + check_and_get_column<ColumnNullable>(column_with_type_and_name.column)) { + null_map_column_ptr = nullable_column->get_null_map_column_ptr(); + column_ptr = nullable_column->get_nested_column_ptr(); + is_nullable = true; + } else { + column_ptr = column_with_type_and_name.column; + is_nullable = false; + } + if (const ColumnInt32* col_integer = check_and_get_column<ColumnInt32>(column_ptr)) { + auto col_res = ColumnInt32::create(); + ColumnInt32::Container& out_data = col_res->get_data(); + out_data.resize(col_integer->get_data().size()); + + const ColumnInt32::Container& in_data = col_integer->get_data(); + const int* end_in = in_data.data() + in_data.size(); + + const Int32* __restrict p_in = in_data.data(); + Int32* __restrict p_out = out_data.data(); + + while (p_in < end_in) { + Int64 long_value = static_cast<Int64>(*p_in); + uint32_t hash_value = HashUtil::murmur_hash3_32(&long_value, sizeof(long_value), 0); + + *p_out = ((hash_value >> 1) & INT32_MAX) % _bucket_num; + ++p_in; + ++p_out; + } + if (is_nullable) { + auto res_column = ColumnNullable::create(std::move(col_res), null_map_column_ptr); + return {res_column, + DataTypeFactory::instance().create_data_type(get_result_type(), true), + column_with_type_and_name.name}; + } else { + return {std::move(col_res), + DataTypeFactory::instance().create_data_type(get_result_type(), false), + column_with_type_and_name.name}; + } + } else { + //assert(0); + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "IntBucketPartitionColumnTransform transform partition error " + "use column_pos {} ", + column_pos); + } + } + +private: + TypeDescriptor _source_type; + int _bucket_num; + TypeDescriptor _target_type; +}; + +class 
BigintBucketPartitionColumnTransform : public PartitionColumnTransform { +public: + BigintBucketPartitionColumnTransform(const TypeDescriptor& source_type, int bucket_num) + : _source_type(source_type), _bucket_num(bucket_num), _target_type(TYPE_INT) {} + + std::string name() const override { return "BigintBucket"; } + + const TypeDescriptor& get_result_type() const override { return _target_type; } + + ColumnWithTypeAndName apply(Block& block, int column_pos) override { + const ColumnWithTypeAndName& column_with_type_and_name = block.get_by_position(column_pos); + + ColumnPtr column_ptr; + ColumnPtr null_map_column_ptr; + bool is_nullable = false; + if (auto* nullable_column = + check_and_get_column<ColumnNullable>(column_with_type_and_name.column)) { + null_map_column_ptr = nullable_column->get_null_map_column_ptr(); + column_ptr = nullable_column->get_nested_column_ptr(); + is_nullable = true; + } else { + column_ptr = column_with_type_and_name.column; + is_nullable = false; + } + if (const ColumnInt64* col_integer = check_and_get_column<ColumnInt64>(column_ptr)) { + auto col_res = ColumnInt64::create(); + ColumnInt64::Container& out_data = col_res->get_data(); + out_data.resize(col_integer->get_data().size()); + + const ColumnInt64::Container& in_data = col_integer->get_data(); + const Int64* end_in = in_data.data() + in_data.size(); + + const Int64* __restrict p_in = in_data.data(); + Int64* __restrict p_out = out_data.data(); + + while (p_in < end_in) { + Int64 long_value = static_cast<Int64>(*p_in); Review Comment: warning: use auto when initializing with a cast to avoid duplicating the type name [modernize-use-auto] ```suggestion auto long_value = static_cast<Int64>(*p_in); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org