This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 1bfe0f03935 [feature](iceberg)support read iceberg complex type,iceberg.orc format and position delete. (#33935) (#34256)
1bfe0f03935 is described below

commit 1bfe0f03935e751e66b95e691b86c64a6c344493
Author: daidai <2017501...@qq.com>
AuthorDate: Mon Apr 29 14:40:12 2024 +0800

    [feature](iceberg)support read iceberg complex type,iceberg.orc format and position delete. (#33935) (#34256)

    master #33935
---
 be/src/vec/exec/format/orc/vorc_reader.cpp         |  46 +-
 be/src/vec/exec/format/orc/vorc_reader.h           |  19 +
 be/src/vec/exec/format/table/iceberg_reader.cpp    | 508 +++++++++++----------
 be/src/vec/exec/format/table/iceberg_reader.h      | 136 ++++--
 be/src/vec/exec/scan/vfile_scanner.cpp             |  23 +-
 .../doris/datasource/iceberg/IcebergUtils.java     |  15 +-
 .../iceberg/iceberg_complex_type.out               | 165 +++++++
 .../iceberg/iceberg_position_delete.out            | 196 ++++++++
 .../iceberg/iceberg_schema_change.out              | 305 +++++++++++++
 .../iceberg/iceberg_complex_type.groovy            |  92 ++++
 .../iceberg/iceberg_position_delete.groovy         | 195 ++++++++
 .../iceberg/iceberg_schema_change.groovy           | 162 +++++++
 12 files changed, 1597 insertions(+), 265 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 7619191a2e7..55e9b1af290 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -305,6 +305,19 @@ Status OrcReader::get_parsed_schema(std::vector<std::string>* col_names,
     return Status::OK();
 }
 
+Status OrcReader::get_schema_col_name_attribute(std::vector<std::string>* col_names,
+                                                std::vector<uint64_t>* col_attributes,
+                                                std::string attribute) {
+    RETURN_IF_ERROR(_create_file_reader());
+    auto& root_type = _is_acid ? _remove_acid(_reader->getType()) : _reader->getType();
+    for (int i = 0; i < root_type.getSubtypeCount(); ++i) {
+        col_names->emplace_back(get_field_name_lower_case(&root_type, i));
+        col_attributes->emplace_back(
+                std::stol(root_type.getSubtype(i)->getAttributeValue(attribute)));
+    }
+    return Status::OK();
+}
+
 Status OrcReader::_init_read_columns() {
     auto& root_type = _reader->getType();
     std::vector<std::string> orc_cols;
@@ -722,7 +735,11 @@ Status OrcReader::set_fill_columns(
     std::unordered_map<std::string, std::pair<uint32_t, int>> predicate_columns;
     std::function<void(VExpr * expr)> visit_slot = [&](VExpr* expr) {
         if (VSlotRef* slot_ref = typeid_cast<VSlotRef*>(expr)) {
-            auto& expr_name = slot_ref->expr_name();
+            auto expr_name = slot_ref->expr_name();
+            auto iter = _table_col_to_file_col.find(expr_name);
+            if (iter != _table_col_to_file_col.end()) {
+                expr_name = iter->second;
+            }
             predicate_columns.emplace(expr_name,
                                       std::make_pair(slot_ref->column_id(), slot_ref->slot_id()));
             if (slot_ref->column_id() == 0) {
@@ -1587,6 +1604,7 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
                     *read_rows = 0;
                     return Status::OK();
                 }
+                _execute_filter_position_delete_rowids(*_filter);
                 RETURN_IF_CATCH_EXCEPTION(Block::filter_block_internal(block, columns_to_filter,
                                                                        *_filter));
                 if (!_not_single_slot_filter_conjuncts.empty()) {
@@ -1694,6 +1712,10 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
             for (auto& conjunct : _non_dict_filter_conjuncts) {
                 filter_conjuncts.emplace_back(conjunct);
            }
+            //include missing_columns != missing_columns ; missing_column is null; missing_column != file_columns etc...
+            for (auto& [missing_col, conjunct] : _lazy_read_ctx.predicate_missing_columns) {
+                filter_conjuncts.emplace_back(conjunct);
+            }
             std::vector<IColumn::Filter*> filters;
             if (_delete_rows_filter_ptr) {
                 filters.push_back(_delete_rows_filter_ptr.get());
             }
@@ -1710,6 +1732,7 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
                     static_cast<void>(_convert_dict_cols_to_string_cols(block, &batch_vec));
                     return Status::OK();
                 }
+                _execute_filter_position_delete_rowids(result_filter);
                 RETURN_IF_CATCH_EXCEPTION(
                         Block::filter_block_internal(block, columns_to_filter, result_filter));
                 if (!_not_single_slot_filter_conjuncts.empty()) {
@@ -1841,6 +1864,10 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s
     for (auto& conjunct : _non_dict_filter_conjuncts) {
         filter_conjuncts.emplace_back(conjunct);
     }
+    //include missing_columns != missing_columns ; missing_column is null; missing_column != file_columns etc...
+    for (auto& [missing_col, conjunct] : _lazy_read_ctx.predicate_missing_columns) {
+        filter_conjuncts.emplace_back(conjunct);
+    }
     std::vector<IColumn::Filter*> filters;
     if (_delete_rows_filter_ptr) {
         filters.push_back(_delete_rows_filter_ptr.get());
     }
@@ -1919,6 +1946,9 @@ bool OrcReader::_can_filter_by_dict(int slot_id) {
             break;
         }
     }
+    if (slot == nullptr) {
+        return false;
+    }
     if (!slot->type().is_string_type()) {
         return false;
     }
@@ -2369,5 +2399,19 @@ void ORCFileInputStream::_collect_profile_before_close() {
         _file_reader->collect_profile_before_close();
     }
 }
 
+void OrcReader::_execute_filter_position_delete_rowids(IColumn::Filter& filter) {
+    if (_position_delete_ordered_rowids == nullptr) {
+        return;
+    }
+    auto start = _row_reader->getRowNumber();
+    auto nums = _batch->numElements;
+    auto l = std::lower_bound(_position_delete_ordered_rowids->begin(),
+                              _position_delete_ordered_rowids->end(), start);
+    auto r = std::upper_bound(_position_delete_ordered_rowids->begin(),
+                              _position_delete_ordered_rowids->end(), start + nums - 1);
+    for (; l < r; l++) {
+        filter[*l - start] = 0;
+    }
+}
 } // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h
index 0b07f147c4f..f5bb7004ca2 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -177,6 +177,19 @@ public:
     Status get_parsed_schema(std::vector<std::string>* col_names,
                              std::vector<TypeDescriptor>* col_types) override;
 
+    Status get_schema_col_name_attribute(std::vector<std::string>* col_names,
+                                         std::vector<uint64_t>* col_attributes,
+                                         std::string attribute);
+    void set_table_col_to_file_col(
+            std::unordered_map<std::string, std::string> table_col_to_file_col) {
+        _table_col_to_file_col = table_col_to_file_col;
+    }
+
+    void set_position_delete_rowids(vector<int64_t>* delete_rows) {
+        _position_delete_ordered_rowids = delete_rows;
+    }
+    void _execute_filter_position_delete_rowids(IColumn::Filter& filter);
+
     void set_delete_rows(const TransactionalHiveReader::AcidRowIDSet* delete_rows) {
         _delete_rows = delete_rows;
     }
@@ -246,6 +259,8 @@ private:
         OrcReader* _orc_reader = nullptr;
     };
 
+    //class RowFilter : public orc::RowReader
+
     // Create inner orc file,
     // return EOF if file is empty
    // return EROOR if encounter error.
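Taken together, the position-delete path added above reduces to a single range lookup: the delete file yields row numbers sorted ascending, and _execute_filter_position_delete_rowids clears the filter slot of every deleted row that falls inside the current batch using two binary searches. A minimal standalone sketch of that step follows; the names mark_position_deletes, deleted, start, num_rows and filter are illustrative only and are not part of the patch.

#include <algorithm>
#include <cstdint>
#include <vector>

// Zero the filter entries of rows listed in a sorted position-delete list.
// `deleted` holds file-level row numbers in ascending order; `start` is the
// row number of the first row in the batch; `filter` has one byte per row.
void mark_position_deletes(const std::vector<int64_t>& deleted, int64_t start,
                           int64_t num_rows, std::vector<uint8_t>& filter) {
    auto l = std::lower_bound(deleted.begin(), deleted.end(), start);
    auto r = std::upper_bound(deleted.begin(), deleted.end(), start + num_rows - 1);
    for (; l != r; ++l) {
        filter[*l - start] = 0; // batch-relative offset of a deleted row
    }
}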
@@ -586,6 +601,10 @@ private: // resolve schema change std::unordered_map<std::string, std::unique_ptr<converter::ColumnTypeConverter>> _converters; + //for iceberg table , when table column name != file column name + std::unordered_map<std::string, std::string> _table_col_to_file_col; + //support iceberg position delete . + std::vector<int64_t>* _position_delete_ordered_rowids = nullptr; }; class ORCFileInputStream : public orc::InputStream, public ProfileCollector { diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp b/be/src/vec/exec/format/table/iceberg_reader.cpp index 8c05b8c2a08..7e5a5bf6db7 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.cpp +++ b/be/src/vec/exec/format/table/iceberg_reader.cpp @@ -49,12 +49,10 @@ #include "vec/common/string_ref.h" #include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" -#include "vec/data_types/data_type.h" #include "vec/data_types/data_type_factory.hpp" #include "vec/exec/format/format_common.h" #include "vec/exec/format/generic_reader.h" -#include "vec/exec/format/parquet/parquet_common.h" -#include "vec/exec/format/parquet/vparquet_reader.h" +#include "vec/exec/format/orc/vorc_reader.h" #include "vec/exec/format/table/table_format_reader.h" namespace cctz { @@ -74,17 +72,6 @@ class VExprContext; } // namespace doris namespace doris::vectorized { - -using DeleteRows = std::vector<int64_t>; -using DeleteFile = phmap::parallel_flat_hash_map< - std::string, std::unique_ptr<DeleteRows>, std::hash<std::string>, - std::equal_to<std::string>, - std::allocator<std::pair<const std::string, std::unique_ptr<DeleteRows>>>, 8, std::mutex>; - -const int64_t MIN_SUPPORT_DELETE_FILES_VERSION = 2; -const std::string ICEBERG_ROW_POS = "pos"; -const std::string ICEBERG_FILE_PATH = "file_path"; - IcebergTableReader::IcebergTableReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, @@ -110,33 +97,6 @@ IcebergTableReader::IcebergTableReader(std::unique_ptr<GenericReader> file_forma ADD_CHILD_TIMER(_profile, "DeleteRowsSortTime", iceberg_profile); } -Status IcebergTableReader::init_reader( - const std::vector<std::string>& file_col_names, - const std::unordered_map<int, std::string>& col_id_name_map, - std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const std::unordered_map<std::string, int>* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) { - ParquetReader* parquet_reader = static_cast<ParquetReader*>(_file_format_reader.get()); - _col_id_name_map = col_id_name_map; - _file_col_names = file_col_names; - _colname_to_value_range = colname_to_value_range; - auto parquet_meta_kv = parquet_reader->get_metadata_key_values(); - static_cast<void>(_gen_col_name_maps(parquet_meta_kv)); - _gen_file_col_names(); - _gen_new_colname_to_value_range(); - parquet_reader->set_table_to_file_col_map(_table_col_to_file_col); - parquet_reader->iceberg_sanitize(_all_required_col_names); - Status status = parquet_reader->init_reader( - _all_required_col_names, _not_in_file_col_names, &_new_colname_to_value_range, - conjuncts, tuple_descriptor, row_descriptor, colname_to_slot_id, - not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); - - return status; -} - Status 
IcebergTableReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { // already get rows from be if (_push_down_agg_type == TPushAggOp::type::COUNT && _remaining_push_down_count > 0) { @@ -220,18 +180,17 @@ Status IcebergTableReader::init_row_filters(const TFileRangeDesc& range) { } if (delete_file_type == POSITION_DELETE) { - RETURN_IF_ERROR(_position_delete(files)); + RETURN_IF_ERROR(_position_delete_base(files)); + } else if (delete_file_type == EQUALITY_DELETE) { + // todo: equality delete + // If it is a count operation and it has equality delete file kind, + // the push down operation of the count for this split needs to be canceled. + return Status::NotSupported("NOT SUPPORT EQUALITY_DELETE!"); } - - // todo: equality delete - // If it is a count operation and it has equality delete file kind, - // the push down operation of the count for this split needs to be canceled. - COUNTER_UPDATE(_iceberg_profile.num_delete_files, files.size()); return Status::OK(); } - -Status IcebergTableReader::_position_delete( +Status IcebergTableReader::_position_delete_base( const std::vector<TIcebergDeleteFileDesc>& delete_files) { std::string data_file_path = _range.path; // the path in _range is remove the namenode prefix, @@ -243,133 +202,31 @@ Status IcebergTableReader::_position_delete( } } - // position delete - ParquetReader* parquet_reader = (ParquetReader*)(_file_format_reader.get()); - RowRange whole_range = parquet_reader->get_whole_range(); - bool init_schema = false; - std::vector<std::string> delete_file_col_names; - std::vector<TypeDescriptor> delete_file_col_types; std::vector<DeleteRows*> delete_rows_array; int64_t num_delete_rows = 0; std::vector<DeleteFile*> erase_data; for (auto& delete_file : delete_files) { - if (whole_range.last_row <= delete_file.position_lower_bound || - whole_range.first_row > delete_file.position_upper_bound) { - continue; - } - SCOPED_TIMER(_iceberg_profile.delete_files_read_time); Status create_status = Status::OK(); - DeleteFile* delete_file_cache = _kv_cache->get< - DeleteFile>(_delet_file_cache_key(delete_file.path), [&]() -> DeleteFile* { - TFileRangeDesc delete_range; - // must use __set() method to make sure __isset is true - delete_range.__set_fs_name(_range.fs_name); - delete_range.path = delete_file.path; - delete_range.start_offset = 0; - delete_range.size = -1; - delete_range.file_size = -1; - ParquetReader delete_reader(_profile, _params, delete_range, 102400, - const_cast<cctz::time_zone*>(&_state->timezone_obj()), - _io_ctx, _state); - if (!init_schema) { - static_cast<void>(delete_reader.get_parsed_schema(&delete_file_col_names, - &delete_file_col_types)); - init_schema = true; - } - create_status = delete_reader.open(); - if (!create_status.ok()) { - return nullptr; - } - create_status = delete_reader.init_reader(delete_file_col_names, _not_in_file_col_names, - nullptr, {}, nullptr, nullptr, nullptr, - nullptr, nullptr, false); - if (!create_status.ok()) { - return nullptr; - } - - std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>> - partition_columns; - std::unordered_map<std::string, VExprContextSPtr> missing_columns; - static_cast<void>(delete_reader.set_fill_columns(partition_columns, missing_columns)); - - bool dictionary_coded = true; - const tparquet::FileMetaData* meta_data = delete_reader.get_meta_data(); - for (int i = 0; i < delete_file_col_names.size(); ++i) { - if (delete_file_col_names[i] == ICEBERG_FILE_PATH) { - for (int j = 0; j < meta_data->row_groups.size(); ++j) { - auto& 
column_chunk = meta_data->row_groups[j].columns[i]; - if (!(column_chunk.__isset.meta_data && - column_chunk.meta_data.__isset.dictionary_page_offset)) { - dictionary_coded = false; - break; - } + auto* delete_file_cache = _kv_cache->get<DeleteFile>( + _delet_file_cache_key(delete_file.path), [&]() -> DeleteFile* { + auto* position_delete = new DeleteFile; + TFileRangeDesc delete_file_range; + // must use __set() method to make sure __isset is true + delete_file_range.__set_fs_name(_range.fs_name); + delete_file_range.path = delete_file.path; + delete_file_range.start_offset = 0; + delete_file_range.size = -1; + delete_file_range.file_size = -1; + //read position delete file base on delete_file_range , generate DeleteFile , add DeleteFile to kv_cache + create_status = _read_position_delete_file(&delete_file_range, position_delete); + + if (!create_status) { + return nullptr; } - break; - } - } - DeleteFile* position_delete = new DeleteFile; - bool eof = false; - while (!eof) { - Block block = Block(); - for (int i = 0; i < delete_file_col_names.size(); ++i) { - DataTypePtr data_type = DataTypeFactory::instance().create_data_type( - delete_file_col_types[i], false); - if (delete_file_col_names[i] == ICEBERG_FILE_PATH && dictionary_coded) { - // the dictionary data in ColumnDictI32 is referenced by StringValue, it does keep - // the dictionary data in its life circle, so the upper caller should keep the - // dictionary data alive after ColumnDictI32. - MutableColumnPtr dict_column = ColumnDictI32::create(); - block.insert(ColumnWithTypeAndName(std::move(dict_column), data_type, - delete_file_col_names[i])); - } else { - MutableColumnPtr data_column = data_type->create_column(); - block.insert(ColumnWithTypeAndName(std::move(data_column), data_type, - delete_file_col_names[i])); - } - } - eof = false; - size_t read_rows = 0; - create_status = delete_reader.get_next_block(&block, &read_rows, &eof); - if (!create_status.ok()) { - return nullptr; - } - if (read_rows > 0) { - ColumnPtr path_column = block.get_by_name(ICEBERG_FILE_PATH).column; - DCHECK_EQ(path_column->size(), read_rows); - ColumnPtr pos_column = block.get_by_name(ICEBERG_ROW_POS).column; - using ColumnType = typename PrimitiveTypeTraits<TYPE_BIGINT>::ColumnType; - const int64_t* src_data = - assert_cast<const ColumnType&>(*pos_column).get_data().data(); - IcebergTableReader::PositionDeleteRange range; - if (dictionary_coded) { - range = _get_range(assert_cast<const ColumnDictI32&>(*path_column)); - } else { - range = _get_range(assert_cast<const ColumnString&>(*path_column)); - } - for (int i = 0; i < range.range.size(); ++i) { - std::string key = range.data_file_path[i]; - auto iter = position_delete->find(key); - DeleteRows* delete_rows; - if (iter == position_delete->end()) { - delete_rows = new DeleteRows; - std::unique_ptr<DeleteRows> delete_rows_ptr(delete_rows); - (*position_delete)[key] = std::move(delete_rows_ptr); - } else { - delete_rows = iter->second.get(); - } - const int64_t* cpy_start = src_data + range.range[i].first; - const int64_t cpy_count = range.range[i].second - range.range[i].first; - int64_t origin_size = delete_rows->size(); - delete_rows->resize(origin_size + cpy_count); - int64_t* dest_position = &(*delete_rows)[origin_size]; - memcpy(dest_position, cpy_start, cpy_count * sizeof(int64_t)); - } - } - } - return position_delete; - }); + return position_delete; + }); if (create_status.is<ErrorCode::END_OF_FILE>()) { continue; } else if (!create_status.ok()) { @@ -382,10 +239,7 @@ Status 
IcebergTableReader::_position_delete( if (row_ids->size() > 0) { delete_rows_array.emplace_back(row_ids); num_delete_rows += row_ids->size(); - if (row_ids->front() >= whole_range.first_row && - row_ids->back() < whole_range.last_row) { - erase_data.emplace_back(delete_file_cache); - } + erase_data.emplace_back(delete_file_cache); } }; delete_file_map.if_contains(data_file_path, get_value); @@ -393,7 +247,7 @@ Status IcebergTableReader::_position_delete( if (num_delete_rows > 0) { SCOPED_TIMER(_iceberg_profile.delete_rows_sort_time); _sort_delete_rows(delete_rows_array, num_delete_rows); - parquet_reader->set_delete_rows(&_delete_rows); + this->set_delete_rows(); COUNTER_UPDATE(_iceberg_profile.num_delete_rows, num_delete_rows); } // the deleted rows are copy out, we can erase them. @@ -428,9 +282,9 @@ IcebergTableReader::PositionDeleteRange IcebergTableReader::_get_range( int index = 0; while (index < read_rows) { StringRef data_path = file_path_column.get_data_at(index); - int left = index; - int right = read_rows - 1; - while (left < right) { + int left = index - 1; + int right = read_rows; + while (left + 1 != right) { int mid = left + (right - left) / 2; if (file_path_column.get_data_at(mid) > data_path) { right = mid; @@ -451,23 +305,23 @@ void IcebergTableReader::_sort_delete_rows(std::vector<std::vector<int64_t>*>& d return; } if (delete_rows_array.size() == 1) { - _delete_rows.resize(num_delete_rows); - memcpy(&_delete_rows[0], &((*delete_rows_array.front())[0]), + _iceberg_delete_rows.resize(num_delete_rows); + memcpy(&_iceberg_delete_rows[0], &((*delete_rows_array.front())[0]), sizeof(int64_t) * num_delete_rows); return; } if (delete_rows_array.size() == 2) { - _delete_rows.resize(num_delete_rows); + _iceberg_delete_rows.resize(num_delete_rows); std::merge(delete_rows_array.front()->begin(), delete_rows_array.front()->end(), delete_rows_array.back()->begin(), delete_rows_array.back()->end(), - _delete_rows.begin()); + _iceberg_delete_rows.begin()); return; } using vec_pair = std::pair<std::vector<int64_t>::iterator, std::vector<int64_t>::iterator>; - _delete_rows.resize(num_delete_rows); - auto row_id_iter = _delete_rows.begin(); - auto iter_end = _delete_rows.end(); + _iceberg_delete_rows.resize(num_delete_rows); + auto row_id_iter = _iceberg_delete_rows.begin(); + auto iter_end = _iceberg_delete_rows.end(); std::vector<vec_pair> rows_array; for (auto rows : delete_rows_array) { if (rows->size() > 0) { @@ -493,6 +347,222 @@ void IcebergTableReader::_sort_delete_rows(std::vector<std::vector<int64_t>*>& d } } +/* + * Generate _all_required_col_names and _not_in_file_col_names. + * + * _all_required_col_names is all the columns required by user sql. + * If the column name has been modified after the data file was written, + * put the old name in data file to _all_required_col_names. + * + * _not_in_file_col_names is all the columns required by user sql but not in the data file. + * e.g. New columns added after this data file was written. + * The columns added with names used by old dropped columns should consider as a missing column, + * which should be in _not_in_file_col_names. 
+ */ +void IcebergTableReader::_gen_file_col_names() { + _all_required_col_names.clear(); + _not_in_file_col_names.clear(); + for (int i = 0; i < _file_col_names.size(); ++i) { + auto name = _file_col_names[i]; + auto iter = _table_col_to_file_col.find(name); + if (iter == _table_col_to_file_col.end()) { + // If the user creates the iceberg table, directly append the parquet file that already exists, + // there is no 'iceberg.schema' field in the footer of parquet, the '_table_col_to_file_col' may be empty. + // Because we are ignoring case, so, it is converted to lowercase here + auto name_low = to_lower(name); + _all_required_col_names.emplace_back(name_low); + if (_has_iceberg_schema) { + _not_in_file_col_names.emplace_back(name); + } else { + _table_col_to_file_col.emplace(name, name_low); + _file_col_to_table_col.emplace(name_low, name); + if (name != name_low) { + _has_schema_change = true; + } + } + } else { + _all_required_col_names.emplace_back(iter->second); + } + } +} + +/* + * Generate _new_colname_to_value_range, by replacing the column name in + * _colname_to_value_range with column name in data file. + */ +void IcebergTableReader::_gen_new_colname_to_value_range() { + for (auto it = _colname_to_value_range->begin(); it != _colname_to_value_range->end(); it++) { + auto iter = _table_col_to_file_col.find(it->first); + if (iter == _table_col_to_file_col.end()) { + _new_colname_to_value_range.emplace(it->first, it->second); + } else { + _new_colname_to_value_range.emplace(iter->second, it->second); + } + } +} + +void IcebergTableReader::_gen_position_delete_file_range(Block& block, DeleteFile* position_delete, + size_t read_rows, + bool file_path_column_dictionary_coded) { + ColumnPtr path_column = block.get_by_name(ICEBERG_FILE_PATH).column; + DCHECK_EQ(path_column->size(), read_rows); + ColumnPtr pos_column = block.get_by_name(ICEBERG_ROW_POS).column; + using ColumnType = typename PrimitiveTypeTraits<TYPE_BIGINT>::ColumnType; + const int64_t* src_data = assert_cast<const ColumnType&>(*pos_column).get_data().data(); + IcebergTableReader::PositionDeleteRange range; + if (file_path_column_dictionary_coded) { + range = _get_range(assert_cast<const ColumnDictI32&>(*path_column)); + } else { + range = _get_range(assert_cast<const ColumnString&>(*path_column)); + } + for (int i = 0; i < range.range.size(); ++i) { + std::string key = range.data_file_path[i]; + auto iter = position_delete->find(key); + DeleteRows* delete_rows; + if (iter == position_delete->end()) { + delete_rows = new DeleteRows; + std::unique_ptr<DeleteRows> delete_rows_ptr(delete_rows); + (*position_delete)[key] = std::move(delete_rows_ptr); + } else { + delete_rows = iter->second.get(); + } + const int64_t* cpy_start = src_data + range.range[i].first; + const int64_t cpy_count = range.range[i].second - range.range[i].first; + int64_t origin_size = delete_rows->size(); + delete_rows->resize(origin_size + cpy_count); + int64_t* dest_position = &(*delete_rows)[origin_size]; + memcpy(dest_position, cpy_start, cpy_count * sizeof(int64_t)); + } +} + +Status IcebergParquetReader::init_reader( + const std::vector<std::string>& file_col_names, + const std::unordered_map<int, std::string>& col_id_name_map, + std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range, + const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, + const RowDescriptor* row_descriptor, + const std::unordered_map<std::string, int>* colname_to_slot_id, + const VExprContextSPtrs* 
not_single_slot_filter_conjuncts, + const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) { + _file_format = Fileformat::PARQUET; + ParquetReader* parquet_reader = static_cast<ParquetReader*>(_file_format_reader.get()); + _col_id_name_map = col_id_name_map; + _file_col_names = file_col_names; + _colname_to_value_range = colname_to_value_range; + auto parquet_meta_kv = parquet_reader->get_metadata_key_values(); + static_cast<void>(_gen_col_name_maps(parquet_meta_kv)); + _gen_file_col_names(); + _gen_new_colname_to_value_range(); + parquet_reader->set_table_to_file_col_map(_table_col_to_file_col); + parquet_reader->iceberg_sanitize(_all_required_col_names); + Status status = parquet_reader->init_reader( + _all_required_col_names, _not_in_file_col_names, &_new_colname_to_value_range, + conjuncts, tuple_descriptor, row_descriptor, colname_to_slot_id, + not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); + + return status; +} + +Status IcebergParquetReader ::_read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) { + ParquetReader parquet_delete_reader( + _profile, _params, *delete_range, READ_DELETE_FILE_BATCH_SIZE, + const_cast<cctz::time_zone*>(&_state->timezone_obj()), _io_ctx, _state); + + RETURN_IF_ERROR(parquet_delete_reader.open()); + RETURN_IF_ERROR(parquet_delete_reader.init_reader(delete_file_col_names, {}, nullptr, {}, + nullptr, nullptr, nullptr, nullptr, nullptr, + false)); + + std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>> + partition_columns; + std::unordered_map<std::string, VExprContextSPtr> missing_columns; + static_cast<void>(parquet_delete_reader.set_fill_columns(partition_columns, missing_columns)); + + const tparquet::FileMetaData* meta_data = parquet_delete_reader.get_meta_data(); + bool dictionary_coded = true; + for (int j = 0; j < meta_data->row_groups.size(); ++j) { + auto& column_chunk = meta_data->row_groups[j].columns[ICEBERG_FILE_PATH_INDEX]; + if (!(column_chunk.__isset.meta_data && + column_chunk.meta_data.__isset.dictionary_page_offset)) { + dictionary_coded = false; + break; + } + } + DataTypePtr data_type_file_path {new DataTypeString}; + DataTypePtr data_type_pos {new DataTypeInt64}; + bool eof = false; + while (!eof) { + Block block = {dictionary_coded + ? 
ColumnWithTypeAndName {ColumnDictI32::create(), + data_type_file_path, ICEBERG_FILE_PATH} + : ColumnWithTypeAndName {data_type_file_path, ICEBERG_FILE_PATH}, + + {data_type_pos, ICEBERG_ROW_POS}}; + size_t read_rows = 0; + RETURN_IF_ERROR(parquet_delete_reader.get_next_block(&block, &read_rows, &eof)); + + if (read_rows <= 0) { + break; + } + _gen_position_delete_file_range(block, position_delete, read_rows, dictionary_coded); + } + return Status::OK(); +}; + +Status IcebergOrcReader::init_reader( + const std::vector<std::string>& file_col_names, + const std::unordered_map<int, std::string>& col_id_name_map, + std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range, + const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, + const RowDescriptor* row_descriptor, + const std::unordered_map<std::string, int>* colname_to_slot_id, + const VExprContextSPtrs* not_single_slot_filter_conjuncts, + const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) { + _file_format = Fileformat::ORC; + auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get()); + _col_id_name_map = col_id_name_map; + _file_col_names = file_col_names; + _colname_to_value_range = colname_to_value_range; + + RETURN_IF_ERROR(_gen_col_name_maps(orc_reader)); + _gen_file_col_names(); + _gen_new_colname_to_value_range(); + orc_reader->set_table_col_to_file_col(_table_col_to_file_col); + Status status = + orc_reader->init_reader(&_all_required_col_names, &_new_colname_to_value_range, + conjuncts, false, tuple_descriptor, row_descriptor, + not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); + return status; +} + +Status IcebergOrcReader::_read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) { + OrcReader orc_delete_reader(_profile, _state, _params, *delete_range, + READ_DELETE_FILE_BATCH_SIZE, _state->timezone(), _io_ctx); + std::unordered_map<std::string, ColumnValueRangeType> colname_to_value_range; + Status init_status = orc_delete_reader.init_reader( + &delete_file_col_names, &colname_to_value_range, {}, false, {}, {}, nullptr, nullptr); + + std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>> + partition_columns; + std::unordered_map<std::string, VExprContextSPtr> missing_columns; + static_cast<void>(orc_delete_reader.set_fill_columns(partition_columns, missing_columns)); + + bool eof = false; + DataTypePtr data_type_file_path {new DataTypeString}; + DataTypePtr data_type_pos {new DataTypeInt64}; + while (!eof) { + Block block = {{data_type_file_path, ICEBERG_FILE_PATH}, {data_type_pos, ICEBERG_ROW_POS}}; + + size_t read_rows = 0; + RETURN_IF_ERROR(orc_delete_reader.get_next_block(&block, &read_rows, &eof)); + + _gen_position_delete_file_range(block, position_delete, read_rows, false); + } + return Status::OK(); +} + /* * To support schema evolution, Iceberg write the column id to column name map to * parquet file key_value_metadata. @@ -507,7 +577,7 @@ void IcebergTableReader::_sort_delete_rows(std::vector<std::vector<int64_t>*>& d * 1. col1_new -> col1 * 2. 
col1 -> col1_new */ -Status IcebergTableReader::_gen_col_name_maps(std::vector<tparquet::KeyValue> parquet_meta_kv) { +Status IcebergParquetReader::_gen_col_name_maps(std::vector<tparquet::KeyValue> parquet_meta_kv) { for (int i = 0; i < parquet_meta_kv.size(); ++i) { tparquet::KeyValue kv = parquet_meta_kv[i]; if (kv.key == "iceberg.schema") { @@ -545,58 +615,30 @@ Status IcebergTableReader::_gen_col_name_maps(std::vector<tparquet::KeyValue> pa return Status::OK(); } -/* - * Generate _all_required_col_names and _not_in_file_col_names. - * - * _all_required_col_names is all the columns required by user sql. - * If the column name has been modified after the data file was written, - * put the old name in data file to _all_required_col_names. - * - * _not_in_file_col_names is all the columns required by user sql but not in the data file. - * e.g. New columns added after this data file was written. - * The columns added with names used by old dropped columns should consider as a missing column, - * which should be in _not_in_file_col_names. - */ -void IcebergTableReader::_gen_file_col_names() { - _all_required_col_names.clear(); - _not_in_file_col_names.clear(); - for (int i = 0; i < _file_col_names.size(); ++i) { - auto name = _file_col_names[i]; - auto iter = _table_col_to_file_col.find(name); - if (iter == _table_col_to_file_col.end()) { - // If the user creates the iceberg table, directly append the parquet file that already exists, - // there is no 'iceberg.schema' field in the footer of parquet, the '_table_col_to_file_col' may be empty. - // Because we are ignoring case, so, it is converted to lowercase here - auto name_low = to_lower(name); - _all_required_col_names.emplace_back(name_low); - if (_has_iceberg_schema) { - _not_in_file_col_names.emplace_back(name); - } else { - _table_col_to_file_col.emplace(name, name_low); - _file_col_to_table_col.emplace(name_low, name); - if (name != name_low) { - _has_schema_change = true; - } - } - } else { - _all_required_col_names.emplace_back(iter->second); +Status IcebergOrcReader::_gen_col_name_maps(OrcReader* orc_reader) { + std::vector<std::string> col_names; + std::vector<uint64_t> col_ids; + RETURN_IF_ERROR( + orc_reader->get_schema_col_name_attribute(&col_names, &col_ids, ICEBERG_ORC_ATTRIBUTE)); + _has_iceberg_schema = true; + _table_col_to_file_col.clear(); + _file_col_to_table_col.clear(); + for (size_t i = 0; i < col_ids.size(); i++) { + auto col_id = col_ids[i]; + auto& file_col_name = col_names[i]; + + if (_col_id_name_map.find(col_id) == _col_id_name_map.end()) { + _has_schema_change = true; + continue; } - } -} - -/* - * Generate _new_colname_to_value_range, by replacing the column name in - * _colname_to_value_range with column name in data file. 
- */ -void IcebergTableReader::_gen_new_colname_to_value_range() { - for (auto it = _colname_to_value_range->begin(); it != _colname_to_value_range->end(); it++) { - auto iter = _table_col_to_file_col.find(it->first); - if (iter == _table_col_to_file_col.end()) { - _new_colname_to_value_range.emplace(it->first, it->second); - } else { - _new_colname_to_value_range.emplace(iter->second, it->second); + auto& table_col_name = _col_id_name_map[col_id]; + _table_col_to_file_col.emplace(table_col_name, file_col_name); + _file_col_to_table_col.emplace(file_col_name, table_col_name); + if (table_col_name != file_col_name) { + _has_schema_change = true; } } + return Status::OK(); } } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/table/iceberg_reader.h b/be/src/vec/exec/format/table/iceberg_reader.h index 04be8d53f24..50c8d31bed9 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.h +++ b/be/src/vec/exec/format/table/iceberg_reader.h @@ -28,10 +28,16 @@ #include "common/status.h" #include "exec/olap_common.h" +#include "runtime/define_primitive_type.h" +#include "runtime/primitive_type.h" +#include "runtime/runtime_state.h" +#include "runtime/types.h" #include "table_format_reader.h" #include "util/runtime_profile.h" #include "vec/columns/column_dictionary.h" - +#include "vec/exec/format/orc/vorc_reader.h" +#include "vec/exec/format/parquet/vparquet_reader.h" +#include "vec/exprs/vslot_ref.h" namespace tparquet { class KeyValue; } // namespace tparquet @@ -60,8 +66,6 @@ class ShardedKVCache; class VExprContext; class IcebergTableReader : public TableFormatReader { - ENABLE_FACTORY_CREATOR(IcebergTableReader); - public: struct PositionDeleteRange { std::vector<std::string> data_file_path; @@ -74,42 +78,40 @@ public: int64_t push_down_count); ~IcebergTableReader() override = default; - Status init_row_filters(const TFileRangeDesc& range) override; + Status init_row_filters(const TFileRangeDesc& range) final; - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status get_next_block(Block* block, size_t* read_rows, bool* eof) final; Status set_fill_columns( const std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>& partition_columns, - const std::unordered_map<std::string, VExprContextSPtr>& missing_columns) override; + const std::unordered_map<std::string, VExprContextSPtr>& missing_columns) final; - bool fill_all_columns() const override; + bool fill_all_columns() const final; Status get_columns(std::unordered_map<std::string, TypeDescriptor>* name_to_type, - std::unordered_set<std::string>* missing_cols) override; + std::unordered_set<std::string>* missing_cols) final; - Status init_reader( - const std::vector<std::string>& file_col_names, - const std::unordered_map<int, std::string>& col_id_name_map, - std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const std::unordered_map<std::string, int>* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts); + Status _position_delete_base(const std::vector<TIcebergDeleteFileDesc>& delete_files); enum { DATA, POSITION_DELETE, EQUALITY_DELETE }; + enum Fileformat { NONE, PARQUET, ORC, AVRO }; -private: + virtual void set_delete_rows() = 0; + +protected: struct IcebergProfile { RuntimeProfile::Counter* num_delete_files; 
RuntimeProfile::Counter* num_delete_rows; RuntimeProfile::Counter* delete_files_read_time; RuntimeProfile::Counter* delete_rows_sort_time; }; - - Status _position_delete(const std::vector<TIcebergDeleteFileDesc>& delete_files); - + using DeleteRows = std::vector<int64_t>; + using DeleteFile = phmap::parallel_flat_hash_map< + std::string, std::unique_ptr<DeleteRows>, std::hash<std::string>, + std::equal_to<std::string>, + std::allocator<std::pair<const std::string, std::unique_ptr<DeleteRows>>>, 8, + std::mutex>; /** * https://iceberg.apache.org/spec/#position-delete-files * The rows in the delete file must be sorted by file_path then position to optimize filtering rows while scanning. @@ -123,8 +125,8 @@ private: PositionDeleteRange _get_range(const ColumnString& file_path_column); - Status _gen_col_name_maps(std::vector<tparquet::KeyValue> parquet_meta_kv); void _gen_file_col_names(); + void _gen_new_colname_to_value_range(); static std::string _delet_file_cache_key(const std::string& path) { return "delete_" + path; } @@ -135,7 +137,7 @@ private: // owned by scan node ShardedKVCache* _kv_cache; IcebergProfile _iceberg_profile; - std::vector<int64_t> _delete_rows; + std::vector<int64_t> _iceberg_delete_rows; // col names from _file_slot_descs std::vector<std::string> _file_col_names; // file column name to table column name map. For iceberg schema evolution. @@ -143,13 +145,13 @@ private: // table column name to file column name map. For iceberg schema evolution. std::unordered_map<std::string, std::string> _table_col_to_file_col; std::unordered_map<std::string, ColumnValueRangeType>* _colname_to_value_range; - // copy from _colname_to_value_range with new column name that is in parquet file, to support schema evolution. + // copy from _colname_to_value_range with new column name that is in parquet/orc file, to support schema evolution. std::unordered_map<std::string, ColumnValueRangeType> _new_colname_to_value_range; // column id to name map. Collect from FE slot descriptor. 
std::unordered_map<int, std::string> _col_id_name_map; - // col names in the parquet file + // col names in the parquet,orc file std::vector<std::string> _all_required_col_names; - // col names in table but not in parquet file + // col names in table but not in parquet,orc file std::vector<std::string> _not_in_file_col_names; io::IOContext* _io_ctx; @@ -157,6 +159,88 @@ private: bool _has_iceberg_schema = false; int64_t _remaining_push_down_count; + Fileformat _file_format = Fileformat::NONE; + + const int64_t MIN_SUPPORT_DELETE_FILES_VERSION = 2; + const std::string ICEBERG_ROW_POS = "pos"; + const std::string ICEBERG_FILE_PATH = "file_path"; + const std::vector<std::string> delete_file_col_names {ICEBERG_ROW_POS, ICEBERG_FILE_PATH}; + const std::vector<TypeDescriptor> delete_file_col_types {{TYPE_STRING}, {TYPE_BIGINT}}; + const int ICEBERG_FILE_PATH_INDEX = 0; + const int ICEBERG_FILE_POS_INDEX = 1; + const int READ_DELETE_FILE_BATCH_SIZE = 102400; + + //Read position_delete_file TFileRangeDesc, generate DeleteFile + virtual Status _read_position_delete_file(const TFileRangeDesc*, DeleteFile*) = 0; + + void _gen_position_delete_file_range(Block& block, DeleteFile* const position_delete, + size_t read_rows, bool file_path_column_dictionary_coded); +}; + +class IcebergParquetReader final : public IcebergTableReader { +public: + ENABLE_FACTORY_CREATOR(IcebergParquetReader); + + IcebergParquetReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, + RuntimeState* state, const TFileScanRangeParams& params, + const TFileRangeDesc& range, ShardedKVCache* kv_cache, + io::IOContext* io_ctx, int64_t push_down_count) + : IcebergTableReader(std::move(file_format_reader), profile, state, params, range, + kv_cache, io_ctx, push_down_count) {} + Status init_reader( + const std::vector<std::string>& file_col_names, + const std::unordered_map<int, std::string>& col_id_name_map, + std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range, + const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, + const RowDescriptor* row_descriptor, + const std::unordered_map<std::string, int>* colname_to_slot_id, + const VExprContextSPtrs* not_single_slot_filter_conjuncts, + const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts); + + Status _read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) override; + + void set_delete_rows() override { + auto* parquet_reader = (ParquetReader*)(_file_format_reader.get()); + parquet_reader->set_delete_rows(&_iceberg_delete_rows); + } + + Status _gen_col_name_maps(std::vector<tparquet::KeyValue> parquet_meta_kv); +}; +class IcebergOrcReader final : public IcebergTableReader { +public: + ENABLE_FACTORY_CREATOR(IcebergOrcReader); + + Status _read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) override; + + IcebergOrcReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, + RuntimeState* state, const TFileScanRangeParams& params, + const TFileRangeDesc& range, ShardedKVCache* kv_cache, io::IOContext* io_ctx, + int64_t push_down_count) + : IcebergTableReader(std::move(file_format_reader), profile, state, params, range, + kv_cache, io_ctx, push_down_count) {} + + void set_delete_rows() override { + auto* orc_reader = (OrcReader*)_file_format_reader.get(); + orc_reader->set_position_delete_rowids(&_iceberg_delete_rows); + } + + Status init_reader( + const std::vector<std::string>& 
file_col_names, + const std::unordered_map<int, std::string>& col_id_name_map, + std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range, + const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, + const RowDescriptor* row_descriptor, + const std::unordered_map<std::string, int>* colname_to_slot_id, + const VExprContextSPtrs* not_single_slot_filter_conjuncts, + const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts); + + Status _gen_col_name_maps(OrcReader* orc_reader); + +private: + const std::string ICEBERG_ORC_ATTRIBUTE = "iceberg.id"; }; + } // namespace vectorized } // namespace doris diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index fb3eaec0789..c4bb8e20a15 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -820,15 +820,16 @@ Status VFileScanner::_get_next_reader() { } if (range.__isset.table_format_params && range.table_format_params.table_format_type == "iceberg") { - std::unique_ptr<IcebergTableReader> iceberg_reader = - IcebergTableReader::create_unique(std::move(parquet_reader), _profile, - _state, *_params, range, _kv_cache, - _io_ctx.get(), _get_push_down_count()); + std::unique_ptr<IcebergParquetReader> iceberg_reader = + IcebergParquetReader::create_unique(std::move(parquet_reader), _profile, + _state, *_params, range, _kv_cache, + _io_ctx.get(), _get_push_down_count()); init_status = iceberg_reader->init_reader( _file_col_names, _col_id_name_map, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); + RETURN_IF_ERROR(iceberg_reader->init_row_filters(range)); _cur_reader = std::move(iceberg_reader); } else { @@ -870,6 +871,20 @@ Status VFileScanner::_get_next_reader() { &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); RETURN_IF_ERROR(tran_orc_reader->init_row_filters(range)); _cur_reader = std::move(tran_orc_reader); + } else if (range.__isset.table_format_params && + range.table_format_params.table_format_type == "iceberg") { + std::unique_ptr<IcebergOrcReader> iceberg_reader = IcebergOrcReader::create_unique( + std::move(orc_reader), _profile, _state, *_params, range, _kv_cache, + _io_ctx.get(), _get_push_down_count()); + + init_status = iceberg_reader->init_reader( + _file_col_names, _col_id_name_map, _colname_to_value_range, + _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), + _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, + &_slot_id_to_filter_conjuncts); + + RETURN_IF_ERROR(iceberg_reader->init_row_filters(range)); + _cur_reader = std::move(iceberg_reader); } else { init_status = orc_reader->init_reader( &_file_col_names, _colname_to_value_range, _push_down_conjuncts, false, diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java index 70c384a0c4b..05519f84595 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java @@ -37,7 +37,10 @@ import org.apache.doris.analysis.Subquery; import org.apache.doris.catalog.ArrayType; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.MapType; import org.apache.doris.catalog.ScalarType; +import 
org.apache.doris.catalog.StructField; +import org.apache.doris.catalog.StructType; import org.apache.doris.catalog.Type; import org.apache.doris.common.UserException; import org.apache.doris.common.util.TimeUtils; @@ -65,6 +68,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.stream.Collectors; /** * Iceberg utils @@ -503,8 +507,17 @@ public class IcebergUtils { Types.ListType list = (Types.ListType) type; return ArrayType.create(icebergTypeToDorisType(list.elementType()), true); case MAP: + Types.MapType map = (Types.MapType) type; + return new MapType( + icebergTypeToDorisType(map.keyType()), + icebergTypeToDorisType(map.valueType()) + ); case STRUCT: - return Type.UNSUPPORTED; + Types.StructType struct = (Types.StructType) type; + ArrayList<StructField> nestedTypes = struct.fields().stream().map( + x -> new StructField(x.name(), icebergTypeToDorisType(x.type())) + ).collect(Collectors.toCollection(ArrayList::new)); + return new StructType(nestedTypes); default: throw new IllegalArgumentException("Cannot transform unknown type: " + type); } diff --git a/regression-test/data/external_table_p2/iceberg/iceberg_complex_type.out b/regression-test/data/external_table_p2/iceberg/iceberg_complex_type.out new file mode 100644 index 00000000000..2250381e6f8 --- /dev/null +++ b/regression-test/data/external_table_p2/iceberg/iceberg_complex_type.out @@ -0,0 +1,165 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !parquet_v1_1 -- +id INT Yes true \N +col2 ARRAY<ARRAY<ARRAY<ARRAY<ARRAY<INT>>>>> Yes true \N +col3 MAP<ARRAY<FLOAT>,MAP<INT,MAP<INT,FLOAT>>> Yes true \N +col4 STRUCT<x:ARRAY<INT>,y:ARRAY<DOUBLE>,z:MAP<BOOLEAN,TEXT>> Yes true \N +col5 MAP<INT,MAP<INT,MAP<INT,MAP<INT,MAP<FLOAT,MAP<DOUBLE,STRUCT<x:INT,y:ARRAY<DOUBLE>>>>>>>> Yes true \N +col6 STRUCT<xx:ARRAY<INT>,yy:ARRAY<MAP<DOUBLE,FLOAT>>,zz:STRUCT<xxx:STRUCT<xxxx:STRUCT<xxxxx:DECIMAL(13, 2)>>>> Yes true \N + +-- !parquet_v1_2 -- +1 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +2 [[[[[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +3 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]], [[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +4 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:222}}} {"x": [2], "y": [2, 2, 22935, 99, 59, 955], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +5 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2, 239, 39293259, 2223, 23, 59, 29, 9353]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +6 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": 
[2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}, {2:2}, {2:2}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} + +-- !parquet_v1_3 -- +6 + +-- !parquet_v1_4 -- +1 +1 +2 +1 +1 +1 + +-- !parquet_v1_5 -- +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] + +-- !parquet_v1_6 -- + +-- !parquet_v1_7 -- +6 1 +5 1 + +-- !parquet_v2_1 -- +id INT Yes true \N +col2 ARRAY<ARRAY<ARRAY<ARRAY<ARRAY<INT>>>>> Yes true \N +col3 MAP<ARRAY<FLOAT>,MAP<INT,MAP<INT,FLOAT>>> Yes true \N +col4 STRUCT<x:ARRAY<INT>,y:ARRAY<DOUBLE>,z:MAP<BOOLEAN,TEXT>> Yes true \N +col5 MAP<INT,MAP<INT,MAP<INT,MAP<INT,MAP<FLOAT,MAP<DOUBLE,STRUCT<x:INT,y:ARRAY<DOUBLE>>>>>>>> Yes true \N +col6 STRUCT<xx:ARRAY<INT>,yy:ARRAY<MAP<DOUBLE,FLOAT>>,zz:STRUCT<xxx:STRUCT<xxxx:STRUCT<xxxxx:DECIMAL(13, 2)>>>> Yes true \N + +-- !parquet_v2_2 -- +1 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +2 [[[[[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +3 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]], [[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +4 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:222}}} {"x": [2], "y": [2, 2, 22935, 99, 59, 955], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +5 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2, 239, 39293259, 2223, 23, 59, 29, 9353]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +6 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}, {2:2}, {2:2}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} + +-- !parquet_v2_3 -- +6 + +-- !parquet_v2_4 -- +1 +1 +2 +1 +1 +1 + +-- !parquet_v2_5 -- +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] + +-- !parquet_v2_6 -- + +-- !parquet_v2_7 -- +6 1 +5 1 + +-- !orc_v1_1 -- +id INT Yes true \N +col2 ARRAY<ARRAY<ARRAY<ARRAY<ARRAY<INT>>>>> Yes true \N +col3 MAP<ARRAY<FLOAT>,MAP<INT,MAP<INT,FLOAT>>> Yes true \N +col4 STRUCT<x:ARRAY<INT>,y:ARRAY<DOUBLE>,z:MAP<BOOLEAN,TEXT>> Yes true \N +col5 MAP<INT,MAP<INT,MAP<INT,MAP<INT,MAP<FLOAT,MAP<DOUBLE,STRUCT<x:INT,y:ARRAY<DOUBLE>>>>>>>> Yes true \N +col6 STRUCT<xx:ARRAY<INT>,yy:ARRAY<MAP<DOUBLE,FLOAT>>,zz:STRUCT<xxx:STRUCT<xxxx:STRUCT<xxxxx:DECIMAL(13, 2)>>>> Yes true \N + +-- !orc_v1_2 -- +1 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +2 [[[[[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], 
"yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +3 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]], [[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +4 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:222}}} {"x": [2], "y": [2, 2, 22935, 99, 59, 955], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +5 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2, 239, 39293259, 2223, 23, 59, 29, 9353]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +6 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}, {2:2}, {2:2}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} + +-- !orc_v1_3 -- +6 + +-- !orc_v1_4 -- +1 +1 +2 +1 +1 +1 + +-- !orc_v1_5 -- +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] + +-- !orc_v1_6 -- + +-- !orc_v1_7 -- +6 1 +5 1 + +-- !orc_v2_1 -- +id INT Yes true \N +col2 ARRAY<ARRAY<ARRAY<ARRAY<ARRAY<INT>>>>> Yes true \N +col3 MAP<ARRAY<FLOAT>,MAP<INT,MAP<INT,FLOAT>>> Yes true \N +col4 STRUCT<x:ARRAY<INT>,y:ARRAY<DOUBLE>,z:MAP<BOOLEAN,TEXT>> Yes true \N +col5 MAP<INT,MAP<INT,MAP<INT,MAP<INT,MAP<FLOAT,MAP<DOUBLE,STRUCT<x:INT,y:ARRAY<DOUBLE>>>>>>>> Yes true \N +col6 STRUCT<xx:ARRAY<INT>,yy:ARRAY<MAP<DOUBLE,FLOAT>>,zz:STRUCT<xxx:STRUCT<xxxx:STRUCT<xxxxx:DECIMAL(13, 2)>>>> Yes true \N + +-- !orc_v2_2 -- +1 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +2 [[[[[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +3 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]], [[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +4 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:222}}} {"x": [2], "y": [2, 2, 22935, 99, 59, 955], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +5 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2, 239, 39293259, 2223, 23, 59, 29, 9353]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +6 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}, {2:2}, {2:2}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, 
{2:4}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} + +-- !orc_v2_3 -- +6 + +-- !orc_v2_4 -- +1 +1 +2 +1 +1 +1 + +-- !orc_v2_5 -- +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] + +-- !orc_v2_6 -- + +-- !orc_v2_7 -- +6 1 +5 1 + diff --git a/regression-test/data/external_table_p2/iceberg/iceberg_position_delete.out b/regression-test/data/external_table_p2/iceberg/iceberg_position_delete.out new file mode 100644 index 00000000000..2d61ebaaf50 --- /dev/null +++ b/regression-test/data/external_table_p2/iceberg/iceberg_position_delete.out @@ -0,0 +1,196 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !gen_data_1 -- + +-- !gen_data_2 -- + +-- !gen_data_3 -- + +-- !gen_data_4 -- + +-- !gen_data_5 -- + +-- !gen_data_6 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !gen_data_7 -- +2 +2 +2 + +-- !gen_data_8 -- +2 512 +3 512 +4 512 +6 512 +7 512 +8 512 +9 512 +11 512 +12 512 +13 512 +14 512 + +-- !gen_data_9 -- + +-- !gen_data_10 -- + +-- !gen_data_11 -- + +-- !gen_data_12 -- + +-- !gen_data_13 -- + +-- !gen_data_14 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !gen_data_15 -- +7 12345xxx +7 12345xxx +7 12345xxx + +-- !gen_data_16 -- + +-- !gen_data_17 -- + +-- !gen_data_18 -- + +-- !gen_data_19 -- +5632 + +-- !gen_data_20 -- +5632 + +-- !orc_1 -- + +-- !orc_2 -- + +-- !orc_3 -- + +-- !orc_4 -- + +-- !orc_5 -- + +-- !orc_6 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !orc_7 -- +2 +2 +2 + +-- !orc_8 -- +2 512 +3 512 +4 512 +6 512 +7 512 +8 512 +9 512 +11 512 +12 512 +13 512 +14 512 + +-- !orc_9 -- + +-- !orc_10 -- + +-- !orc_11 -- + +-- !orc_12 -- + +-- !orc_13 -- + +-- !orc_14 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !orc_15 -- +7 12345xxx +7 12345xxx +7 12345xxx + +-- !orc_16 -- + +-- !orc_17 -- + +-- !orc_18 -- + +-- !orc_19 -- +5632 + +-- !orc_20 -- +5632 + +-- !parquet_1 -- + +-- !parquet_2 -- + +-- !parquet_3 -- + +-- !parquet_4 -- + +-- !parquet_5 -- + +-- !parquet_6 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !parquet_7 -- +2 +2 +2 + +-- !parquet_8 -- +2 512 +3 512 +4 512 +6 512 +7 512 +8 512 +9 512 +11 512 +12 512 +13 512 +14 512 + +-- !parquet_9 -- + +-- !parquet_10 -- + +-- !parquet_11 -- + +-- !parquet_12 -- + +-- !parquet_13 -- + +-- !parquet_14 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !parquet_15 -- +7 12345xxx +7 12345xxx +7 12345xxx + +-- !parquet_16 -- + +-- !parquet_17 -- + +-- !parquet_18 -- + +-- !parquet_19 -- +5632 + +-- !parquet_20 -- +5632 + diff --git a/regression-test/data/external_table_p2/iceberg/iceberg_schema_change.out b/regression-test/data/external_table_p2/iceberg/iceberg_schema_change.out new file mode 100644 index 00000000000..4faf8c695ec --- /dev/null +++ b/regression-test/data/external_table_p2/iceberg/iceberg_schema_change.out @@ -0,0 +1,305 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !parquet_v1_1 -- +rename_col8 BIGINT Yes true \N +rename_col9 DOUBLE Yes true \N +rename_col10 DECIMAL(20, 5) Yes true \N +id INT Yes true \N +rename_col1 ARRAY<BIGINT> Yes true \N +rename_col2 ARRAY<DOUBLE> Yes true \N +rename_col3 ARRAY<DECIMAL(20, 4)> Yes true \N +rename_col4 MAP<INT,BIGINT> Yes true \N +rename_col5 MAP<INT,DOUBLE> Yes true \N +rename_col6 MAP<INT,DECIMAL(20, 5)> Yes true \N +rename_col7 STRUCT<add:DOUBLE,x:BIGINT,y:DOUBLE> Yes true \N +col_add INT Yes true \N +col_add2 INT Yes true \N + +-- !parquet_v1_2 -- +1 1.2000000476837158 1.12345 1 [1, 2, 3] [1.100000023841858, 2.200000047683716, 3.299999952316284] [1.1234, 2.2345, 3.3456] {1:10, 2:20} {1:1.100000023841858, 2:2.200000047683716} {1:1.12345, 2:2.23456} {"add": null, "x": 1, "y": 1.100000023841858} \N \N +1 1.2000000476837158 1.12345 2 [4, 5, 6] [4.400000095367432, 5.5, 6.599999904632568] [4.4567, 5.5678, 6.6789] {3:30, 4:40} {3:3.299999952316284, 4:4.400000095367432} {3:3.34567, 4:4.45678} {"add": null, "x": 2, "y": 2.200000047683716} \N \N +1 1.2000000476837158 1.12345 3 [7, 8, 9] [7.699999809265137, 8.800000190734863, 9.899999618530273] [7.7890, 8.8901, 9.9012] {5:50, 6:60} {5:5.5, 6:6.599999904632568} {5:5.56789, 6:6.67890} {"add": null, "x": 3, "y": 3.299999952316284} \N \N +1 1.2000000476837158 1.12345 4 [10, 11, 12] [10.100000381469727, 11.109999656677246, 12.119999885559082] [10.1011, 11.1112, 12.1213] {7:70, 8:80} {7:7.699999809265137, 8:8.800000190734863} {7:7.78901, 8:8.89012} {"add": null, "x": 4, "y": 4.400000095367432} \N \N +1 1.2000000476837158 1.12345 5 [13, 14, 15] [13.130000114440918, 14.140000343322754, 15.149999618530273] [13.1314, 14.1415, 15.1516] {9:90, 10:100} {9:9.899999618530273, 10:10.100000381469727} {9:9.89012, 10:10.10123} {"add": null, "x": 5, "y": 5.5} \N \N +21447483648 1.7976931348623157E308 1234567890.12345 6 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 1 2 +2144748345648 1.7976931348623157E308 1234567890.23456 7 [2144748345648, 214742435483649, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 2 3 +21447483648 1.7976931348623157E308 1234567890.12345 8 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 3 4 +2144748345648 1.7976931348623157E308 1234567890.23456 9 [2144748345648, 214742435483649, 214742435483650, 214742435483650, 214742435483650, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 
12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": [...] + +-- !parquet_v1_3 -- +9 + +-- !parquet_v1_4 -- +6 +7 +8 +9 + +-- !parquet_v1_5 -- +1 +2 +3 +4 + +-- !parquet_v1_6 -- +6 +7 +8 +9 + +-- !parquet_v1_7 -- +{"add": null, "x": 1, "y": 1.100000023841858} +{"add": null, "x": 2, "y": 2.200000047683716} +{"add": null, "x": 3, "y": 3.299999952316284} +{"add": null, "x": 4, "y": 4.400000095367432} +{"add": null, "x": 5, "y": 5.5} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} + +-- !parquet_v1_8 -- +3 +4 +5 + +-- !parquet_v1_9 -- +9 1 +8 1 +7 1 +6 1 +5 0 +4 0 +3 0 +2 0 +1 0 + +-- !parquet_v1_10 -- + +-- !parquet_v2_1 -- +rename_col8 BIGINT Yes true \N +rename_col9 DOUBLE Yes true \N +rename_col10 DECIMAL(20, 5) Yes true \N +id INT Yes true \N +rename_col1 ARRAY<BIGINT> Yes true \N +rename_col2 ARRAY<DOUBLE> Yes true \N +rename_col3 ARRAY<DECIMAL(20, 4)> Yes true \N +rename_col4 MAP<INT,BIGINT> Yes true \N +rename_col5 MAP<INT,DOUBLE> Yes true \N +rename_col6 MAP<INT,DECIMAL(20, 5)> Yes true \N +rename_col7 STRUCT<add:DOUBLE,x:BIGINT,y:DOUBLE> Yes true \N +col_add INT Yes true \N +col_add2 INT Yes true \N + +-- !parquet_v2_2 -- +1 1.2000000476837158 1.12345 1 [1, 2, 3] [1.100000023841858, 2.200000047683716, 3.299999952316284] [1.1234, 2.2345, 3.3456] {1:10, 2:20} {1:1.100000023841858, 2:2.200000047683716} {1:1.12345, 2:2.23456} {"add": null, "x": 1, "y": 1.100000023841858} \N \N +1 1.2000000476837158 1.12345 2 [4, 5, 6] [4.400000095367432, 5.5, 6.599999904632568] [4.4567, 5.5678, 6.6789] {3:30, 4:40} {3:3.299999952316284, 4:4.400000095367432} {3:3.34567, 4:4.45678} {"add": null, "x": 2, "y": 2.200000047683716} \N \N +1 1.2000000476837158 1.12345 3 [7, 8, 9] [7.699999809265137, 8.800000190734863, 9.899999618530273] [7.7890, 8.8901, 9.9012] {5:50, 6:60} {5:5.5, 6:6.599999904632568} {5:5.56789, 6:6.67890} {"add": null, "x": 3, "y": 3.299999952316284} \N \N +1 1.2000000476837158 1.12345 4 [10, 11, 12] [10.100000381469727, 11.109999656677246, 12.119999885559082] [10.1011, 11.1112, 12.1213] {7:70, 8:80} {7:7.699999809265137, 8:8.800000190734863} {7:7.78901, 8:8.89012} {"add": null, "x": 4, "y": 4.400000095367432} \N \N +1 1.2000000476837158 1.12345 5 [13, 14, 15] [13.130000114440918, 14.140000343322754, 15.149999618530273] [13.1314, 14.1415, 15.1516] {9:90, 10:100} {9:9.899999618530273, 10:10.100000381469727} {9:9.89012, 10:10.10123} {"add": null, "x": 5, "y": 5.5} \N \N +21447483648 1.7976931348623157E308 1234567890.12345 6 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 1 2 +2144748345648 1.7976931348623157E308 1234567890.23456 7 [2144748345648, 214742435483649, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 
12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 2 3 +21447483648 1.7976931348623157E308 1234567890.12345 8 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 3 4 +2144748345648 1.7976931348623157E308 1234567890.23456 9 [2144748345648, 214742435483649, 214742435483650, 214742435483650, 214742435483650, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": [...] + +-- !parquet_v2_3 -- +9 + +-- !parquet_v2_4 -- +6 +7 +8 +9 + +-- !parquet_v2_5 -- +1 +2 +3 +4 + +-- !parquet_v2_6 -- +6 +7 +8 +9 + +-- !parquet_v2_7 -- +{"add": null, "x": 1, "y": 1.100000023841858} +{"add": null, "x": 2, "y": 2.200000047683716} +{"add": null, "x": 3, "y": 3.299999952316284} +{"add": null, "x": 4, "y": 4.400000095367432} +{"add": null, "x": 5, "y": 5.5} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} + +-- !parquet_v2_8 -- +3 +4 +5 + +-- !parquet_v2_9 -- +9 1 +8 1 +7 1 +6 1 +5 0 +4 0 +3 0 +2 0 +1 0 + +-- !parquet_v2_10 -- + +-- !orc_v1_1 -- +rename_col8 BIGINT Yes true \N +rename_col9 DOUBLE Yes true \N +rename_col10 DECIMAL(20, 5) Yes true \N +id INT Yes true \N +rename_col1 ARRAY<BIGINT> Yes true \N +rename_col2 ARRAY<DOUBLE> Yes true \N +rename_col3 ARRAY<DECIMAL(20, 4)> Yes true \N +rename_col4 MAP<INT,BIGINT> Yes true \N +rename_col5 MAP<INT,DOUBLE> Yes true \N +rename_col6 MAP<INT,DECIMAL(20, 5)> Yes true \N +rename_col7 STRUCT<add:DOUBLE,x:BIGINT,y:DOUBLE> Yes true \N +col_add INT Yes true \N +col_add2 INT Yes true \N + +-- !orc_v1_2 -- +1 1.2000000476837158 1.12345 1 [1, 2, 3] [1.100000023841858, 2.200000047683716, 3.299999952316284] [1.1234, 2.2345, 3.3456] {1:10, 2:20} {1:1.100000023841858, 2:2.200000047683716} {1:1.12345, 2:2.23456} {"add": null, "x": 1, "y": 1.100000023841858} \N \N +1 1.2000000476837158 1.12345 2 [4, 5, 6] [4.400000095367432, 5.5, 6.599999904632568] [4.4567, 5.5678, 6.6789] {3:30, 4:40} {3:3.299999952316284, 4:4.400000095367432} {3:3.34567, 4:4.45678} {"add": null, "x": 2, "y": 2.200000047683716} \N \N +1 1.2000000476837158 1.12345 3 [7, 8, 9] [7.699999809265137, 8.800000190734863, 9.899999618530273] [7.7890, 8.8901, 9.9012] {5:50, 6:60} {5:5.5, 6:6.599999904632568} {5:5.56789, 6:6.67890} {"add": null, "x": 3, "y": 3.299999952316284} \N \N +1 1.2000000476837158 1.12345 4 [10, 11, 12] [10.100000381469727, 11.109999656677246, 12.119999885559082] [10.1011, 11.1112, 12.1213] {7:70, 8:80} {7:7.699999809265137, 8:8.800000190734863} {7:7.78901, 8:8.89012} {"add": 
null, "x": 4, "y": 4.400000095367432} \N \N +1 1.2000000476837158 1.12345 5 [13, 14, 15] [13.130000114440918, 14.140000343322754, 15.149999618530273] [13.1314, 14.1415, 15.1516] {9:90, 10:100} {9:9.899999618530273, 10:10.100000381469727} {9:9.89012, 10:10.10123} {"add": null, "x": 5, "y": 5.5} \N \N +21447483648 1.7976931348623157E308 1234567890.12345 6 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 1 2 +2144748345648 1.7976931348623157E308 1234567890.23456 7 [2144748345648, 214742435483649, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 2 3 +21447483648 1.7976931348623157E308 1234567890.12345 8 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 3 4 +2144748345648 1.7976931348623157E308 1234567890.23456 9 [2144748345648, 214742435483649, 214742435483650, 214742435483650, 214742435483650, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": [...] 
+ +-- !orc_v1_3 -- +9 + +-- !orc_v1_4 -- +6 +7 +8 +9 + +-- !orc_v1_5 -- +1 +2 +3 +4 + +-- !orc_v1_6 -- +6 +7 +8 +9 + +-- !orc_v1_7 -- +{"add": null, "x": 1, "y": 1.100000023841858} +{"add": null, "x": 2, "y": 2.200000047683716} +{"add": null, "x": 3, "y": 3.299999952316284} +{"add": null, "x": 4, "y": 4.400000095367432} +{"add": null, "x": 5, "y": 5.5} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} + +-- !orc_v1_8 -- +3 +4 +5 + +-- !orc_v1_9 -- +9 1 +8 1 +7 1 +6 1 +5 0 +4 0 +3 0 +2 0 +1 0 + +-- !orc_v1_10 -- + +-- !orc_v2_1 -- +rename_col8 BIGINT Yes true \N +rename_col9 DOUBLE Yes true \N +rename_col10 DECIMAL(20, 5) Yes true \N +id INT Yes true \N +rename_col1 ARRAY<BIGINT> Yes true \N +rename_col2 ARRAY<DOUBLE> Yes true \N +rename_col3 ARRAY<DECIMAL(20, 4)> Yes true \N +rename_col4 MAP<INT,BIGINT> Yes true \N +rename_col5 MAP<INT,DOUBLE> Yes true \N +rename_col6 MAP<INT,DECIMAL(20, 5)> Yes true \N +rename_col7 STRUCT<add:DOUBLE,x:BIGINT,y:DOUBLE> Yes true \N +col_add INT Yes true \N +col_add2 INT Yes true \N + +-- !orc_v2_2 -- +1 1.2000000476837158 1.12345 1 [1, 2, 3] [1.100000023841858, 2.200000047683716, 3.299999952316284] [1.1234, 2.2345, 3.3456] {1:10, 2:20} {1:1.100000023841858, 2:2.200000047683716} {1:1.12345, 2:2.23456} {"add": null, "x": 1, "y": 1.100000023841858} \N \N +1 1.2000000476837158 1.12345 2 [4, 5, 6] [4.400000095367432, 5.5, 6.599999904632568] [4.4567, 5.5678, 6.6789] {3:30, 4:40} {3:3.299999952316284, 4:4.400000095367432} {3:3.34567, 4:4.45678} {"add": null, "x": 2, "y": 2.200000047683716} \N \N +1 1.2000000476837158 1.12345 3 [7, 8, 9] [7.699999809265137, 8.800000190734863, 9.899999618530273] [7.7890, 8.8901, 9.9012] {5:50, 6:60} {5:5.5, 6:6.599999904632568} {5:5.56789, 6:6.67890} {"add": null, "x": 3, "y": 3.299999952316284} \N \N +1 1.2000000476837158 1.12345 4 [10, 11, 12] [10.100000381469727, 11.109999656677246, 12.119999885559082] [10.1011, 11.1112, 12.1213] {7:70, 8:80} {7:7.699999809265137, 8:8.800000190734863} {7:7.78901, 8:8.89012} {"add": null, "x": 4, "y": 4.400000095367432} \N \N +1 1.2000000476837158 1.12345 5 [13, 14, 15] [13.130000114440918, 14.140000343322754, 15.149999618530273] [13.1314, 14.1415, 15.1516] {9:90, 10:100} {9:9.899999618530273, 10:10.100000381469727} {9:9.89012, 10:10.10123} {"add": null, "x": 5, "y": 5.5} \N \N +21447483648 1.7976931348623157E308 1234567890.12345 6 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 1 2 +2144748345648 1.7976931348623157E308 1234567890.23456 7 [2144748345648, 214742435483649, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 2 3 +21447483648 
1.7976931348623157E308 1234567890.12345 8 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 3 4 +2144748345648 1.7976931348623157E308 1234567890.23456 9 [2144748345648, 214742435483649, 214742435483650, 214742435483650, 214742435483650, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": [...] + +-- !orc_v2_3 -- +9 + +-- !orc_v2_4 -- +6 +7 +8 +9 + +-- !orc_v2_5 -- +1 +2 +3 +4 + +-- !orc_v2_6 -- +6 +7 +8 +9 + +-- !orc_v2_7 -- +{"add": null, "x": 1, "y": 1.100000023841858} +{"add": null, "x": 2, "y": 2.200000047683716} +{"add": null, "x": 3, "y": 3.299999952316284} +{"add": null, "x": 4, "y": 4.400000095367432} +{"add": null, "x": 5, "y": 5.5} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} + +-- !orc_v2_8 -- +3 +4 +5 + +-- !orc_v2_9 -- +9 1 +8 1 +7 1 +6 1 +5 0 +4 0 +3 0 +2 0 +1 0 + +-- !orc_v2_10 -- + diff --git a/regression-test/suites/external_table_p2/iceberg/iceberg_complex_type.groovy b/regression-test/suites/external_table_p2/iceberg/iceberg_complex_type.groovy new file mode 100644 index 00000000000..f465a9afe37 --- /dev/null +++ b/regression-test/suites/external_table_p2/iceberg/iceberg_complex_type.groovy @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
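+ +// Reads Iceberg tables whose columns are deeply nested array/map/struct types, +// covering Iceberg format v1 and v2 with both Parquet and ORC data files.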
+ +suite("iceberg_complex_type", "p2,external,iceberg,external_remote,external_remote_iceberg") { + + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + + String catalog_name = "test_external_iceberg_complex_type" + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHdfsPort = context.config.otherConfigs.get("extHdfsPort") + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='iceberg', + 'iceberg.catalog.type'='hadoop', + 'warehouse' = 'hdfs://${extHiveHmsHost}:${extHdfsPort}/usr/hive/warehouse/hadoop_catalog' + ); + """ + + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """ use multi_catalog;""" + + + + qt_parquet_v1_1 """ desc complex_parquet_v1 ;""" + qt_parquet_v1_2 """ select * from complex_parquet_v1 order by id; """ + qt_parquet_v1_3 """ select count(*) from complex_parquet_v1 ;""" + qt_parquet_v1_4 """ select array_size(col2) from complex_parquet_v1 where col2 is not null order by id ; """ + qt_parquet_v1_5 """ select map_keys(col3) from complex_parquet_v1 order by id; """ + qt_parquet_v1_6 """ select struct_element(col4, 1) from complex_parquet_v1 where id >=7 order by id; """ + qt_parquet_v1_7 """ select id,count(col2) from complex_parquet_v1 group by id order by id desc limit 2; """ + + + qt_parquet_v2_1 """ desc complex_parquet_v2 ;""" + qt_parquet_v2_2 """ select * from complex_parquet_v2 order by id; """ + qt_parquet_v2_3 """ select count(*) from complex_parquet_v2 ;""" + qt_parquet_v2_4 """ select array_size(col2) from complex_parquet_v2 where col2 is not null order by id ; """ + qt_parquet_v2_5 """ select map_keys(col3) from complex_parquet_v2 order by id; """ + qt_parquet_v2_6 """ select struct_element(col4, 1) from complex_parquet_v2 where id >=7 order by id; """ + qt_parquet_v2_7 """ select id,count(col2) from complex_parquet_v2 group by id order by id desc limit 2; """ + + + qt_orc_v1_1 """ desc complex_orc_v1 ;""" + qt_orc_v1_2 """ select * from complex_orc_v1 order by id; """ + qt_orc_v1_3 """ select count(*) from complex_orc_v1 ;""" + qt_orc_v1_4 """ select array_size(col2) from complex_orc_v1 where col2 is not null order by id ; """ + qt_orc_v1_5 """ select map_keys(col3) from complex_orc_v1 order by id; """ + qt_orc_v1_6 """ select struct_element(col4, 1) from complex_orc_v1 where id >=7 order by id; """ + qt_orc_v1_7 """ select id,count(col2) from complex_orc_v1 group by id order by id desc limit 2; """ + + + qt_orc_v2_1 """ desc complex_orc_v2 ;""" + qt_orc_v2_2 """ select * from complex_orc_v2 order by id; """ + qt_orc_v2_3 """ select count(*) from complex_orc_v2 ;""" + qt_orc_v2_4 """ select array_size(col2) from complex_orc_v2 where col2 is not null order by id ; """ + qt_orc_v2_5 """ select map_keys(col3) from complex_orc_v2 order by id; """ + qt_orc_v2_6 """ select struct_element(col4, 1) from complex_orc_v2 where id >=7 order by id; """ + qt_orc_v2_7 """ select id,count(col2) from complex_orc_v2 group by id order by id desc limit 2; """ + + + + + } +} + +/* +schema : + id int + col2 array<array<array<array<array<int>>>>> + col3 map<array<float>,map<int,map<int,float>>> + col4 struct<x:array<int>,y:array<double>,z:map<boolean,string>> + col5 map<int,map<int,map<int,map<int,map<float,map<double,struct<x:int,y:array<double>>>>>>>> + col6 
struct<xx:array<int>,yy:array<map<double,float>>,zz:struct<xxx:struct<xxxx:struct<xxxxx:decimal(13,2)>>>> + +*/ \ No newline at end of file diff --git a/regression-test/suites/external_table_p2/iceberg/iceberg_position_delete.groovy b/regression-test/suites/external_table_p2/iceberg/iceberg_position_delete.groovy new file mode 100644 index 00000000000..4cb497c3078 --- /dev/null +++ b/regression-test/suites/external_table_p2/iceberg/iceberg_position_delete.groovy @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("iceberg_position_delete", "p2,external,iceberg,external_remote,external_remote_iceberg") { + + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + + String catalog_name = "test_external_iceberg_position_delete" + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHdfsPort = context.config.otherConfigs.get("extHdfsPort") + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='iceberg', + 'iceberg.catalog.type'='hadoop', + 'warehouse' = 'hdfs://${extHiveHmsHost}:${extHdfsPort}/usr/hive/warehouse/hadoop_catalog' + ); + """ + + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """ use multi_catalog;""" + + qt_gen_data_1 """ select * from iceberg_position_gen_data where name = 'xyzxxxxxx' and id != 9;""" + qt_gen_data_2 """ select * from iceberg_position_gen_data where id = 1; """ + qt_gen_data_3 """ select * from iceberg_position_gen_data where id = 5; """ + qt_gen_data_4 """ select * from iceberg_position_gen_data where id = 10; """ + qt_gen_data_5 """ select * from iceberg_position_gen_data where id = 15; """ + qt_gen_data_6 """ select * from iceberg_position_gen_data where id = 2 limit 3;""" + qt_gen_data_7 """ select id from iceberg_position_gen_data where id = 2 limit 3;""" + qt_gen_data_8 """ select id,count(name) from iceberg_position_gen_data where id != 1 group by id order by id ;""" + qt_gen_data_9 """ select id from iceberg_position_gen_data where id = 1; """ + qt_gen_data_10 """ select name from iceberg_position_gen_data where id = 5; """ + qt_gen_data_11 """ select id from iceberg_position_gen_data where id = 10; """ + qt_gen_data_12 """ select name from iceberg_position_gen_data where id = 15;""" + qt_gen_data_13 """ select * from iceberg_position_gen_data where id = 15 and name = 'select xxxxxxxxx';""" + qt_gen_data_14 """ select * from iceberg_position_gen_data where id = 2 and name = 'select xxxxxxxxx' limit 3;""" + qt_gen_data_15 """ select * from iceberg_position_gen_data where id = 7 and name = '12345xxx' 
limit 3;""" + qt_gen_data_16 """ select * from iceberg_position_gen_data where name = 'hello world' ;""" + qt_gen_data_17 """ select name from iceberg_position_gen_data where name = 'hello world' ;""" + qt_gen_data_18 """ select id from iceberg_position_gen_data where name = 'hello world' ;""" + qt_gen_data_19 """ select count(*) from iceberg_position_gen_data where name != 'final entryxxxxxx' ;""" + qt_gen_data_20 """ select count(*) from iceberg_position_gen_data; """ + + + qt_orc_1 """ select * from iceberg_position_orc where name = 'xyzxxxxxx' and id != 9;""" + qt_orc_2 """ select * from iceberg_position_orc where id = 1; """ + qt_orc_3 """ select * from iceberg_position_orc where id = 5; """ + qt_orc_4 """ select * from iceberg_position_orc where id = 10; """ + qt_orc_5 """ select * from iceberg_position_orc where id = 15; """ + qt_orc_6 """ select * from iceberg_position_orc where id = 2 limit 3;""" + qt_orc_7 """ select id from iceberg_position_orc where id = 2 limit 3;""" + qt_orc_8 """ select id,count(name) from iceberg_position_orc where id != 1 group by id order by id ;""" + qt_orc_9 """ select id from iceberg_position_orc where id = 1; """ + qt_orc_10 """ select name from iceberg_position_orc where id = 5; """ + qt_orc_11 """ select id from iceberg_position_orc where id = 10; """ + qt_orc_12 """ select name from iceberg_position_orc where id = 15;""" + qt_orc_13 """ select * from iceberg_position_orc where id = 15 and name = 'select xxxxxxxxx';""" + qt_orc_14 """ select * from iceberg_position_orc where id = 2 and name = 'select xxxxxxxxx' limit 3;""" + qt_orc_15 """ select * from iceberg_position_orc where id = 7 and name = '12345xxx' limit 3;""" + qt_orc_16 """ select * from iceberg_position_orc where name = 'hello world' ;""" + qt_orc_17 """ select name from iceberg_position_orc where name = 'hello world' ;""" + qt_orc_18 """ select id from iceberg_position_orc where name = 'hello world' ;""" + qt_orc_19 """ select count(*) from iceberg_position_orc where name != 'final entryxxxxxx' ;""" + qt_orc_20 """ select count(*) from iceberg_position_orc; """ + + qt_parquet_1 """ select * from iceberg_position_parquet where name = 'xyzxxxxxx' and id != 9;""" + qt_parquet_2 """ select * from iceberg_position_parquet where id = 1; """ + qt_parquet_3 """ select * from iceberg_position_parquet where id = 5; """ + qt_parquet_4 """ select * from iceberg_position_parquet where id = 10; """ + qt_parquet_5 """ select * from iceberg_position_parquet where id = 15; """ + qt_parquet_6 """ select * from iceberg_position_parquet where id = 2 limit 3;""" + qt_parquet_7 """ select id from iceberg_position_parquet where id = 2 limit 3;""" + qt_parquet_8 """ select id,count(name) from iceberg_position_parquet where id != 1 group by id order by id ;""" + qt_parquet_9 """ select id from iceberg_position_parquet where id = 1; """ + qt_parquet_10 """ select name from iceberg_position_parquet where id = 5; """ + qt_parquet_11 """ select id from iceberg_position_parquet where id = 10; """ + qt_parquet_12 """ select name from iceberg_position_parquet where id = 15;""" + qt_parquet_13 """ select * from iceberg_position_parquet where id = 15 and name = 'select xxxxxxxxx';""" + qt_parquet_14 """ select * from iceberg_position_parquet where id = 2 and name = 'select xxxxxxxxx' limit 3;""" + qt_parquet_15 """ select * from iceberg_position_parquet where id = 7 and name = '12345xxx' limit 3;""" + qt_parquet_16 """ select * from iceberg_position_parquet where name = 'hello world' ;""" + qt_parquet_17 """ select name 
from iceberg_position_parquet where name = 'hello world' ;""" + qt_parquet_18 """ select id from iceberg_position_parquet where name = 'hello world' ;""" + qt_parquet_19 """ select count(*) from iceberg_position_parquet where name != 'final entryxxxxxx' ;""" + qt_parquet_20 """ select count(*) from iceberg_position_parquet; """ + + } +} +/* + + +create table iceberg_position_gen_data( + id int, + name string +) +USING iceberg +TBLPROPERTIES ( + 'format-version' = '2', + 'write.format.default' = 'orc', + 'write.update.mode' = 'merge-on-read', + 'write.merge.mode' = 'merge-on-read', + 'write.delete.mode' = 'merge-on-read' +); + +INSERT INTO iceberg_position_gen_data VALUES +(1, "hello world"), +(2, "select xxxxxxxxx"), +(3, "example xxxx"), +(4, "more dataxxx"), +(5, "another examplexxx"), +(6, "testxxx"), +(7, "12345xxx"), +(8, "abcdefxxxx"), +(9, "xyzxxxxxx"), +(10, "inserted dataxxxxx"), +(11, "SQLxxxxx"), +(12, "tablexxxx"), +(13, "rowxxxx"), +(14, "data entryxxxx"), +(15, "final entryxxxxxx"); +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; + + + +create table iceberg_position_parquet( + id int, + name string +) +USING iceberg +TBLPROPERTIES ( + 'format-version' = '2', + 'write.format.default' = 'parquet', + 'write.update.mode' = 'merge-on-read', + 'write.merge.mode' = 'merge-on-read', + 'write.delete.mode' = 'merge-on-read' +); +create table iceberg_position_orc( + id int, + name string +) +USING iceberg +TBLPROPERTIES ( + 'format-version' = '2', + 'write.format.default' = 'orc', + 'write.update.mode' = 'merge-on-read', + 'write.merge.mode' = 'merge-on-read', + 'write.delete.mode' = 'merge-on-read' +); + +insert into iceberg_position_parquet select * from iceberg_position_gen_data; +insert into iceberg_position_orc select * from iceberg_position_parquet; + + +delete from iceberg_position_gen_data where id = 1; +delete from iceberg_position_gen_data where id = 5; +delete from iceberg_position_gen_data where id = 10; +delete from iceberg_position_gen_data where id = 15; + +delete from iceberg_position_parquet where id = 1; +delete from iceberg_position_parquet where id = 5; +delete from iceberg_position_parquet where id = 10; +delete from iceberg_position_parquet where id = 15; + +delete from iceberg_position_orc where id = 1; +delete from iceberg_position_orc where id = 5; +delete from iceberg_position_orc where id = 10; +delete from iceberg_position_orc where id = 15; +*/ + + diff --git a/regression-test/suites/external_table_p2/iceberg/iceberg_schema_change.groovy b/regression-test/suites/external_table_p2/iceberg/iceberg_schema_change.groovy new file mode 100644 index 00000000000..5e036683595 --- /dev/null +++ b/regression-test/suites/external_table_p2/iceberg/iceberg_schema_change.groovy @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("iceberg_schema_change", "p2,external,iceberg,external_remote,external_remote_iceberg") { + + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + + String catalog_name = "test_external_iceberg_schema_change" + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHdfsPort = context.config.otherConfigs.get("extHdfsPort") + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='iceberg', + 'iceberg.catalog.type'='hadoop', + 'warehouse' = 'hdfs://${extHiveHmsHost}:${extHdfsPort}/usr/hive/warehouse/hadoop_catalog' + ); + """ + + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """ use multi_catalog;""" + + + + + + qt_parquet_v1_1 """ desc complex_parquet_v1_schema_change ;""" + qt_parquet_v1_2 """ select * from complex_parquet_v1_schema_change order by id; """ + qt_parquet_v1_3 """ select count(*) from complex_parquet_v1_schema_change ;""" + qt_parquet_v1_4 """ select id from complex_parquet_v1_schema_change where col_add +1 = col_add2 order by id;""" + qt_parquet_v1_5 """ select col_add from complex_parquet_v1_schema_change where col_add is not null order by col_add ; """ + qt_parquet_v1_6 """ select id from complex_parquet_v1_schema_change where col_add + 5 = id order by id; """ + qt_parquet_v1_7 """ select rename_col7 from complex_parquet_v1_schema_change order by id; """ + qt_parquet_v1_8 """ select col_add2 from complex_parquet_v1_schema_change where id >=7 order by id; """ + qt_parquet_v1_9 """ select id,count(col_add) from complex_parquet_v1_schema_change group by id order by id desc ; """ + qt_parquet_v1_10 """ select col_add from complex_parquet_v1_schema_change where col_add -1 = col_add2 order by id; """ + + + + qt_parquet_v2_1 """ desc complex_parquet_v2_schema_change ;""" + qt_parquet_v2_2 """ select * from complex_parquet_v2_schema_change order by id; """ + qt_parquet_v2_3 """ select count(*) from complex_parquet_v2_schema_change ;""" + qt_parquet_v2_4 """ select id from complex_parquet_v2_schema_change where col_add +1 = col_add2 order by id;""" + qt_parquet_v2_5 """ select col_add from complex_parquet_v2_schema_change where col_add is not null order by col_add ; """ + qt_parquet_v2_6 """ select id from complex_parquet_v2_schema_change where col_add + 5 = id order by id; """ + qt_parquet_v2_7 """ select rename_col7 from complex_parquet_v2_schema_change order by id; """ + qt_parquet_v2_8 """ select col_add2 from complex_parquet_v2_schema_change where id >=7 order by id; """ + qt_parquet_v2_9 """ select id,count(col_add) from complex_parquet_v2_schema_change group by id order by id desc ; """ + 
qt_parquet_v2_10 """ select col_add from complex_parquet_v2_schema_change where col_add -1 = col_add2 order by id; """ + + + + + qt_orc_v1_1 """ desc complex_orc_v1_schema_change ;""" + qt_orc_v1_2 """ select * from complex_orc_v1_schema_change order by id; """ + qt_orc_v1_3 """ select count(*) from complex_orc_v1_schema_change ;""" + qt_orc_v1_4 """ select id from complex_orc_v1_schema_change where col_add +1 = col_add2 order by id;""" + qt_orc_v1_5 """ select col_add from complex_orc_v1_schema_change where col_add is not null order by col_add ; """ + qt_orc_v1_6 """ select id from complex_orc_v1_schema_change where col_add + 5 = id order by id; """ + qt_orc_v1_7 """ select rename_col7 from complex_orc_v1_schema_change order by id; """ + qt_orc_v1_8 """ select col_add2 from complex_orc_v1_schema_change where id >=7 order by id; """ + qt_orc_v1_9 """ select id,count(col_add) from complex_orc_v1_schema_change group by id order by id desc ; """ + qt_orc_v1_10 """ select col_add from complex_orc_v1_schema_change where col_add -1 = col_add2 order by id; """ + + + + qt_orc_v2_1 """ desc complex_orc_v2_schema_change ;""" + qt_orc_v2_2 """ select * from complex_orc_v2_schema_change order by id; """ + qt_orc_v2_3 """ select count(*) from complex_orc_v2_schema_change ;""" + qt_orc_v2_4 """ select id from complex_orc_v2_schema_change where col_add +1 = col_add2 order by id;""" + qt_orc_v2_5 """ select col_add from complex_orc_v2_schema_change where col_add is not null order by col_add ; """ + qt_orc_v2_6 """ select id from complex_orc_v2_schema_change where col_add + 5 = id order by id; """ + qt_orc_v2_7 """ select rename_col7 from complex_orc_v2_schema_change order by id; """ + qt_orc_v2_8 """ select col_add2 from complex_orc_v2_schema_change where id >=7 order by id; """ + qt_orc_v2_9 """ select id,count(col_add) from complex_orc_v2_schema_change group by id order by id desc ; """ + qt_orc_v2_10 """ select col_add from complex_orc_v2_schema_change where col_add -1 = col_add2 order by id; """ + + + + } +} +/* +before schema: + id int, + col1 array<int>, + col2 array<float>, + col3 array<decimal(12,4)>, + col4 map<int,int>, + col5 map<int,float>, + col6 map<int,decimal(12,5)>, + col7 struct<x:int,y:float,z:decimal(12,5)>, + col8 int, + col9 float, + col10 decimal(12,5), + col_del int + + +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col1.element type bigint; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col2.element type double; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col3.element type decimal(20,4); +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col4.value type bigint; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col5.value type double; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col6.value type decimal(20,5); +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col7.x type bigint; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col7.y type double; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col7.z type decimal(20,5); +alter table complex_parquet_v2_schema_change CHANGE COLUMN col8 col8 bigint; +alter table complex_parquet_v2_schema_change CHANGE COLUMN col9 col9 double; +alter table complex_parquet_v2_schema_change CHANGE COLUMN col10 col10 decimal(20,5); +alter table complex_parquet_v2_schema_change drop column col7.z; +alter table complex_parquet_v2_schema_change add column col7.add double; +alter table complex_parquet_v2_schema_change change column col7.add first; 
+alter table complex_parquet_v2_schema_change rename COLUMN col1 to rename_col1; +alter table complex_parquet_v2_schema_change rename COLUMN col2 to rename_col2; +alter table complex_parquet_v2_schema_change rename COLUMN col3 to rename_col3; +alter table complex_parquet_v2_schema_change rename COLUMN col4 to rename_col4; +alter table complex_parquet_v2_schema_change rename COLUMN col5 to rename_col5; +alter table complex_parquet_v2_schema_change rename COLUMN col6 to rename_col6; +alter table complex_parquet_v2_schema_change rename COLUMN col7 to rename_col7; +alter table complex_parquet_v2_schema_change rename COLUMN col8 to rename_col8; +alter table complex_parquet_v2_schema_change rename COLUMN col9 to rename_col9; +alter table complex_parquet_v2_schema_change rename COLUMN col10 to rename_col10; +alter table complex_parquet_v2_schema_change drop column col_del; +alter table complex_parquet_v2_schema_change CHANGE COLUMN rename_col8 first; +alter table complex_parquet_v2_schema_change CHANGE COLUMN rename_col9 after rename_col8; +alter table complex_parquet_v2_schema_change CHANGE COLUMN rename_col10 after rename_col9; +alter table complex_parquet_v2_schema_change add column col_add int; +alter table complex_parquet_v2_schema_change add column col_add2 int; + +after schema: + rename_col8 bigint + rename_col9 double + rename_col10 decimal(20,5) + id int + rename_col1 array<bigint> + rename_col2 array<double> + rename_col3 array<decimal(20,4)> + rename_col4 map<int,bigint> + rename_col5 map<int,double> + rename_col6 map<int,decimal(20,5)> + rename_col7 struct<add:double,x:bigint,y:double> + col_add int + col_add2 int + +*/
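The renames, drops, and reorders above work because Iceberg resolves columns by field id rather than by name. For ORC data files, Iceberg's writer persists that id as the "iceberg.id" attribute on each node of the ORC type tree, so a reader can map file columns back to the current table schema even after a rename. A minimal standalone sketch of reading those attributes with the Apache ORC C++ library follows; the file path handling and output format are illustrative, not taken from the Doris reader:

    #include <orc/OrcFile.hh>
    #include <iostream>
    #include <memory>

    // Print each top-level column's Iceberg field id, stored by Iceberg's
    // ORC writer as the "iceberg.id" attribute on the type tree.
    int main(int argc, char** argv) {
        if (argc != 2) {
            std::cerr << "usage: " << argv[0] << " <file.orc>\n";
            return 1;
        }
        orc::ReaderOptions opts;
        std::unique_ptr<orc::Reader> reader =
                orc::createReader(orc::readLocalFile(argv[1]), opts);
        const orc::Type& root = reader->getType();
        for (uint64_t i = 0; i < root.getSubtypeCount(); ++i) {
            const orc::Type* sub = root.getSubtype(i);
            // Files produced by non-Iceberg writers may lack the attribute.
            std::string id = sub->hasAttributeKey("iceberg.id")
                                     ? sub->getAttributeValue("iceberg.id")
                                     : "<none>";
            std::cout << root.getFieldName(i) << " -> field id " << id << "\n";
        }
        return 0;
    }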
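The position-delete suites, in turn, rely on Iceberg v2 merge-on-read semantics: a DELETE does not rewrite data files but writes delete files of (file_path, pos) pairs, and the scanner must drop those row positions while reading the data file. A minimal sketch of that filtering step, assuming the delete positions applying to one data file have already been collected and sorted (an illustration of the technique, not the Doris implementation):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Given the sorted row positions deleted from one data file, mark which
    // rows of a batch survive. `first_row` is the file-level position of the
    // first row in the batch.
    std::vector<uint8_t> position_delete_filter(const std::vector<int64_t>& deleted_rows,
                                                int64_t first_row, size_t batch_size) {
        std::vector<uint8_t> keep(batch_size, 1);
        // Jump to the first delete position that could fall inside this batch.
        auto it = std::lower_bound(deleted_rows.begin(), deleted_rows.end(), first_row);
        int64_t end = first_row + static_cast<int64_t>(batch_size);
        for (; it != deleted_rows.end() && *it < end; ++it) {
            keep[*it - first_row] = 0; // row removed by a position delete
        }
        return keep;
    }

The resulting keep/skip vector can then be applied to the batch the same way an ordinary predicate filter would be, which is why the suites probe deletes together with filters, limits, and aggregates.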