This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new de8d884ec3 [Fix](multi catalog)Fix iceberg parquet file doesn't have iceberg.schema meta problem (#16764) de8d884ec3 is described below commit de8d884ec3101f180078dfaf32b1ab9757a1bef7 Author: Jibing-Li <64681310+jibing...@users.noreply.github.com> AuthorDate: Thu Feb 16 00:08:59 2023 +0800 [Fix](multi catalog)Fix iceberg parquet file doesn't have iceberg.schema meta problem (#16764) To support schema evolution, Iceberg adds schema information to the Parquet file metadata. However, early Iceberg versions do not write any schema information to the Parquet file. This PR adds support for reading Parquet files that have no schema information. --- be/src/vec/exec/format/parquet/vparquet_reader.cpp | 5 +++-- be/src/vec/exec/format/table/iceberg_reader.cpp | 5 ++++- be/src/vec/exec/format/table/iceberg_reader.h | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 2d149d7ba5..3a5d45bfa4 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -207,8 +207,9 @@ Status ParquetReader::init_reader( auto name = schema_desc.get_column(i)->name; // If the column in parquet file is included in all_column_names and not in missing_column_names, // add it to _map_column, which means the reader should read the data of this column. - // Here to check against missing_column_names is to for the 'Add a column with back to the table - // with the same column name' case. Shouldn't read this column data in this case. + // Here to check against missing_column_names is for the 'Add a column back to the table + // with the same column name' case. (drop column a then add column a). + // Shouldn't read this column data in this case. 
if (find(all_column_names.begin(), all_column_names.end(), name) != all_column_names.end() && find(missing_column_names.begin(), missing_column_names.end(), name) == diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp b/be/src/vec/exec/format/table/iceberg_reader.cpp index 0035323432..ce36da6a92 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.cpp +++ b/be/src/vec/exec/format/table/iceberg_reader.cpp @@ -425,6 +425,7 @@ Status IcebergTableReader::_gen_col_name_maps(std::vector<tparquet::KeyValue> pa for (int i = 0; i < parquet_meta_kv.size(); ++i) { tparquet::KeyValue kv = parquet_meta_kv[i]; if (kv.key == "iceberg.schema") { + _has_iceberg_schema = true; std::string schema = kv.value; rapidjson::Document json; json.Parse(schema.c_str()); @@ -478,7 +479,9 @@ void IcebergTableReader::_gen_file_col_names() { auto iter = _table_col_to_file_col.find(name); if (iter == _table_col_to_file_col.end()) { _all_required_col_names.emplace_back(name); - _not_in_file_col_names.emplace_back(name); + if (_has_iceberg_schema) { + _not_in_file_col_names.emplace_back(name); + } } else { _all_required_col_names.emplace_back(iter->second); } diff --git a/be/src/vec/exec/format/table/iceberg_reader.h b/be/src/vec/exec/format/table/iceberg_reader.h index f9e480f28b..b161c06845 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.h +++ b/be/src/vec/exec/format/table/iceberg_reader.h @@ -118,6 +118,7 @@ private: IOContext* _io_ctx; bool _has_schema_change = false; + bool _has_iceberg_schema = false; }; } // namespace vectorized } // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org