This is an automated email from the ASF dual-hosted git repository. kakachen pushed a commit to branch nested_column_prune in repository https://gitbox.apache.org/repos/asf/doris.git
commit d234ae2df72e3b248234d9c3b4a136f2a83f9b45 Author: kakachen <[email protected]> AuthorDate: Sun Nov 9 22:01:41 2025 +0800 code refactoring for external table. --- .idea/icon.svg | 7 - .idea/vcs.xml | 22 -- .../exec/format/parquet/vparquet_column_reader.cpp | 2 + .../table/hive/hive_orc_nested_column_utils.cpp | 5 +- .../hive/hive_parquet_nested_column_utils.cpp | 3 +- be/src/vec/exec/format/table/hive_reader.cpp | 222 ++++----------------- .../iceberg/iceberg_orc_nested_column_utils.cpp | 5 +- .../iceberg/iceberg_orc_nested_column_utils.h | 2 - .../iceberg_parquet_nested_column_utils.cpp | 21 +- be/src/vec/exec/format/table/iceberg_reader.cpp | 109 ++-------- .../format/table/nested_column_access_helper.h | 82 ++++++++ 11 files changed, 151 insertions(+), 329 deletions(-) diff --git a/.idea/icon.svg b/.idea/icon.svg deleted file mode 100644 index 7bd05ce60a8..00000000000 --- a/.idea/icon.svg +++ /dev/null @@ -1,7 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> -<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="512px" height="512px" style="shape-rendering:geometricPrecision; text-rendering:geometricPrecision; image-rendering:optimizeQuality; fill-rule:evenodd; clip-rule:evenodd" xmlns:xlink="http://www.w3.org/1999/xlink"> -<g><path style="opacity:0.998" fill="#14a8c9" d="M 177.5,-0.5 C 185.5,-0.5 193.5,-0.5 201.5,-0.5C 216.485,2.49184 229.818,8.99184 241.5,19C 263.362,40.5281 284.862,62.3615 306,84.5C 321.873,104.738 325.54,127.072 317,151.5C 313.365,160.772 308.699,169.439 303,177.5C 287.138,193.029 271.638,208.862 256.5,225C 251.896,228.6 246.896,229.267 241.5,227C 205.667,191.167 169.833,155.333 134,119.5C 114.347,92.1441 112.347,63.4775 128,33.5C 139.766,15.2282 156.266,3.8949 177.5,-0.5 Z"/></g> -<g><path style="opacity:0.991" fill="#5168ac" d="M 80.5,117.5 C 83.5823,118.017 86.5823,118.85 89.5,120C 131.667,162.167 173.833,204.333 216,246.5C 219.574,253.075 219.241,259.408 215,265.5C 173.167,307.333 131.333,349.167 89.5,391C 79.3707,395.174 72.204,392.341 68,382.5C 67.3333,297.833 67.3333,213.167 68,128.5C 69.8906,122.153 74.0573,118.486 80.5,117.5 Z"/></g> -<g><path style="opacity:0.998" fill="#51c9a2" d="M 206.5,511.5 C 197.5,511.5 188.5,511.5 179.5,511.5C 148.554,504.392 128.72,485.392 120,454.5C 114.417,431.171 119.083,410.171 134,391.5C 199.306,325.194 264.972,259.194 331,193.5C 345.241,176.315 354.407,156.648 358.5,134.5C 376.154,152.988 394.321,170.988 413,188.5C 422.048,199.213 430.714,210.213 439,221.5C 452.98,251.039 450.313,279.039 431,305.5C 369.473,367.361 307.639,428.861 245.5,490C 240.124,495.044 234.458,499.711 228.5,504C 221 [...] -</svg> diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 8b083db2f94..00000000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<project version="4"> - <component name="IssueNavigationConfiguration"> - <option name="links"> - <list> - <IssueNavigationLink> - <option name="issueRegexp" value="#(\d+)" /> - <option name="linkRegexp" value="https://github.com/apache/doris/pull/$1" /> - </IssueNavigationLink> - </list> - </option> - </component> - <component name="VcsDirectoryMappings"> - <mapping directory="$PROJECT_DIR$" vcs="Git" /> - <mapping directory="$PROJECT_DIR$/be/src/apache-orc" vcs="Git" /> - <mapping directory="$PROJECT_DIR$/be/src/clucene" vcs="Git" /> - <mapping directory="$PROJECT_DIR$/contrib/apache-orc" vcs="Git" /> - <mapping directory="$PROJECT_DIR$/contrib/clucene" vcs="Git" /> - <mapping directory="$PROJECT_DIR$/contrib/faiss" vcs="Git" /> - <mapping directory="$PROJECT_DIR$/contrib/openblas" vcs="Git" /> - </component> -</project> \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 589570c7ab0..38c53f961be 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -995,6 +995,8 @@ Status StructColumnReader::read_column_data( if (reference_reader != nullptr) { // Read the reference column to get correct RL/DL information + // TODO: Optimize by only reading RL/DL without actual data decoding + // We need to find the FieldSchema for the reference column from _field_schema children FieldSchema* ref_field_schema = nullptr; for (auto& child : _field_schema->children) { diff --git a/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp b/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp index 1cd02967f0d..9c97747e5a4 100644 --- a/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp +++ b/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp @@ -55,7 +55,6 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids( // Normalization logic: // path: ["map_col", "*"] → ["map_col", "VALUES"] + ["map_col", "KEYS"] // path: ["map_col", "*", "field"] → ["map_col", "VALUES", "field"] + ["map_col", "KEYS"] - // KEYS are always needed for correct RL/DL computation when accessing MAP via wildcard if (type.getKind() == orc::TypeKind::MAP) { auto wildcard_it = child_paths_by_table_col_name.find("*"); if (wildcard_it != child_paths_by_table_col_name.end()) { @@ -65,7 +64,7 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids( auto& values_paths = child_paths_by_table_col_name["VALUES"]; values_paths.insert(values_paths.end(), wildcard_paths.begin(), wildcard_paths.end()); - // Always add KEYS for wildcard access (needed for RL/DL computation) + // Always add KEYS for wildcard access auto& keys_paths = child_paths_by_table_col_name["KEYS"]; // Add an empty path to request full KEYS std::vector<std::string> empty_path; @@ -97,7 +96,7 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids( } // Special handling for Orc MAP structure: - // When accessing only VALUES, we still need KEY structure for levels + // When accessing only VALUES, we still need KEY structure for deduplicate_keys // Check if we're at key child (i==0) and only VALUES is requested (no KEYS) if (i == 0) { bool has_keys_access = child_paths_by_table_col_name.find("KEYS") != diff --git a/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp b/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp index a0052ecfbfc..c2223c2aa18 100644 --- a/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp +++ b/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp @@ -56,7 +56,6 @@ void HiveParquetNestedColumnUtils::extract_nested_column_ids( // Normalization logic: // path: ["map_col", "*"] → ["map_col", "VALUES"] + ["map_col", "KEYS"] // path: ["map_col", "*", "field"] → ["map_col", "VALUES", "field"] + ["map_col", "KEYS"] - // KEYS are always needed for correct RL/DL computation when accessing MAP via wildcard if (field_schema.data_type->get_primitive_type() == PrimitiveType::TYPE_MAP) { auto wildcard_it = child_paths_by_table_col_name.find("*"); if (wildcard_it != child_paths_by_table_col_name.end()) { @@ -66,7 +65,7 @@ void HiveParquetNestedColumnUtils::extract_nested_column_ids( auto& values_paths = child_paths_by_table_col_name["VALUES"]; values_paths.insert(values_paths.end(), wildcard_paths.begin(), wildcard_paths.end()); - // Always add KEYS for wildcard access (needed for RL/DL computation) + // Always add KEYS for wildcard access auto& keys_paths = child_paths_by_table_col_name["KEYS"]; // Add an empty path to request full KEYS std::vector<std::string> empty_path; diff --git a/be/src/vec/exec/format/table/hive_reader.cpp b/be/src/vec/exec/format/table/hive_reader.cpp index b9bc01fe7b6..9f23c23f0bf 100644 --- a/be/src/vec/exec/format/table/hive_reader.cpp +++ b/be/src/vec/exec/format/table/hive_reader.cpp @@ -23,6 +23,7 @@ #include "runtime/runtime_state.h" #include "vec/exec/format/table/hive/hive_orc_nested_column_utils.h" #include "vec/exec/format/table/hive/hive_parquet_nested_column_utils.h" +#include "vec/exec/format/table/nested_column_access_helper.h" namespace doris::vectorized { #include "common/compile_check_begin.h" @@ -87,7 +88,6 @@ Status HiveOrcReader::init_reader( _create_column_ids_by_top_level_col_index(orc_type_ptr, tuple_descriptor); } - // const auto& file_col_names = column_id_result.column_names; const auto& column_ids = column_id_result.column_ids; const auto& filter_column_ids = column_id_result.filter_column_ids; @@ -99,10 +99,6 @@ Status HiveOrcReader::init_reader( ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) { - if (!orc_type) { - return ColumnIdResult(); - } - // map top-level table column name (lower-cased) -> orc::Type* std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map; for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) { @@ -116,57 +112,26 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type, std::set<uint64_t> column_ids; std::set<uint64_t> filter_column_ids; - // helper to process name access paths for a given top-level orc field + // helper to process access paths for a given top-level orc field auto process_access_paths = [](const orc::Type* orc_field, const std::vector<TColumnAccessPath>& access_paths, std::set<uint64_t>& out_ids) { - bool access_paths_empty = access_paths.empty(); - - std::vector<std::vector<std::string>> paths; - bool has_top_level_only = false; - for (const auto& access_path : access_paths) { - std::vector<std::string> path; - if (access_path.type == TAccessPathType::DATA) { - path = access_path.data_access_path.path; - } else if (access_path.type == TAccessPathType::META) { - path = access_path.meta_access_path.path; - } else { - continue; - } - std::vector<std::string> remaining_path; - if (path.size() > 1) { - remaining_path.assign(path.begin() + 1, path.end()); - } else { - // only top-level column name => means whole field - remaining_path = std::vector<std::string>(); - } - if (remaining_path.empty()) { - has_top_level_only = true; - } - paths.push_back(std::move(remaining_path)); - } - - if (has_top_level_only || access_paths_empty) { - uint64_t start_id = orc_field->getColumnId(); - uint64_t max_column_id = orc_field->getMaximumColumnId(); - for (uint64_t id = start_id; id <= max_column_id; ++id) { - out_ids.insert(id); - } - } else if (!paths.empty()) { - out_ids.insert(orc_field->getColumnId()); - HiveOrcNestedColumnUtils::extract_nested_column_ids(*orc_field, paths, out_ids); - } + process_nested_access_paths( + orc_field, access_paths, out_ids, + [](const orc::Type* type) { return type->getColumnId(); }, + [](const orc::Type* type) { return type->getMaximumColumnId(); }, + HiveOrcNestedColumnUtils::extract_nested_column_ids); }; for (const auto* slot : tuple_descriptor->slots()) { auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case()); if (it == table_col_name_to_orc_type_map.end()) { - // Column not found in file (e.g., partition column, added column) + // Column not found in file continue; } const orc::Type* orc_field = it->second; - // primitive (non-nested) types: direct mapping by name + // primitive (non-nested) types if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { column_ids.insert(orc_field->getColumnId()); @@ -176,7 +141,7 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type, continue; } - // complex types: + // complex types const auto& all_access_paths = slot->all_access_paths(); process_access_paths(orc_field, all_access_paths, column_ids); @@ -191,13 +156,7 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type, ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index( const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) { - std::shared_ptr<TableSchemaChangeHelper::Node> schema_node = nullptr; - - if (!orc_type) { - return ColumnIdResult(); - } - - // map top-level table column index -> orc::Type* + // map top-level table column position -> orc::Type* std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map; for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) { auto orc_sub_type = orc_type->getSubtype(i); @@ -209,57 +168,26 @@ ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index( std::set<uint64_t> column_ids; std::set<uint64_t> filter_column_ids; - // helper to process name access paths for a given top-level orc field + // helper to process access paths for a given top-level orc field auto process_access_paths = [](const orc::Type* orc_field, const std::vector<TColumnAccessPath>& access_paths, std::set<uint64_t>& out_ids) { - bool access_paths_empty = access_paths.empty(); - - std::vector<std::vector<std::string>> paths; - bool has_top_level_only = false; - for (const auto& access_path : access_paths) { - std::vector<std::string> path; - if (access_path.type == TAccessPathType::DATA) { - path = access_path.data_access_path.path; - } else if (access_path.type == TAccessPathType::META) { - path = access_path.meta_access_path.path; - } else { - continue; - } - std::vector<std::string> remaining_path; - if (path.size() > 1) { - remaining_path.assign(path.begin() + 1, path.end()); - } else { - // only top-level column name => means whole field - remaining_path = std::vector<std::string>(); - } - if (remaining_path.empty()) { - has_top_level_only = true; - } - paths.push_back(std::move(remaining_path)); - } - - if (has_top_level_only || access_paths_empty) { - uint64_t start_id = orc_field->getColumnId(); - uint64_t max_column_id = orc_field->getMaximumColumnId(); - for (uint64_t id = start_id; id <= max_column_id; ++id) { - out_ids.insert(id); - } - } else if (!paths.empty()) { - out_ids.insert(orc_field->getColumnId()); - HiveOrcNestedColumnUtils::extract_nested_column_ids(*orc_field, paths, out_ids); - } + process_nested_access_paths( + orc_field, access_paths, out_ids, + [](const orc::Type* type) { return type->getColumnId(); }, + [](const orc::Type* type) { return type->getMaximumColumnId(); }, + HiveOrcNestedColumnUtils::extract_nested_column_ids); }; for (const auto* slot : tuple_descriptor->slots()) { auto it = table_col_pos_to_orc_type_map.find(slot->col_pos()); if (it == table_col_pos_to_orc_type_map.end()) { - // Column not found in file (e.g., partition column, added column) + // Column not found in file continue; } const orc::Type* orc_field = it->second; - // primitive (non-nested) types: direct mapping by pos + // primitive (non-nested) types if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { column_ids.insert(orc_field->getColumnId()); @@ -364,14 +292,11 @@ Status HiveParquetReader::init_reader( ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) { - if (!field_desc) { - return ColumnIdResult(); - } - // First, assign column IDs to the field descriptor auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc); mutable_field_desc->assign_ids(); + // map top-level table column name (lower-cased) -> FieldSchema* std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map; for (int i = 0; i < field_desc->size(); ++i) { auto field_schema = field_desc->get_column(i); @@ -383,58 +308,26 @@ ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* fiel std::set<uint64_t> column_ids; std::set<uint64_t> filter_column_ids; - // helper to process name access paths for a given top-level parquet field + // helper to process access paths for a given top-level parquet field auto process_access_paths = [](const FieldSchema* parquet_field, const std::vector<TColumnAccessPath>& access_paths, std::set<uint64_t>& out_ids) { - bool access_paths_empty = access_paths.empty(); - - std::vector<std::vector<std::string>> paths; - bool has_top_level_only = false; - for (const auto& access_path : access_paths) { - std::vector<std::string> path; - if (access_path.type == TAccessPathType::DATA) { - path = access_path.data_access_path.path; - } else if (access_path.type == TAccessPathType::META) { - path = access_path.meta_access_path.path; - } else { - continue; - } - std::vector<std::string> remaining_path; - if (path.size() > 1) { - remaining_path.assign(path.begin() + 1, path.end()); - } else { - // only top-level column name => means whole field - remaining_path = std::vector<std::string>(); - } - if (remaining_path.empty()) { - has_top_level_only = true; - } - paths.push_back(std::move(remaining_path)); - } - - if (has_top_level_only || access_paths_empty) { - uint64_t start_id = parquet_field->get_column_id(); - uint64_t max_column_id = parquet_field->get_max_column_id(); - for (uint64_t id = start_id; id <= max_column_id; ++id) { - out_ids.insert(id); - } - } else if (!paths.empty()) { - out_ids.insert(parquet_field->get_column_id()); - HiveParquetNestedColumnUtils::extract_nested_column_ids(*parquet_field, paths, out_ids); - } + process_nested_access_paths( + parquet_field, access_paths, out_ids, + [](const FieldSchema* field) { return field->get_column_id(); }, + [](const FieldSchema* field) { return field->get_max_column_id(); }, + HiveParquetNestedColumnUtils::extract_nested_column_ids); }; for (const auto* slot : tuple_descriptor->slots()) { - // Find the field schema for this slot (may not exist for partition columns, etc.) auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case()); if (it == table_col_name_to_field_schema_map.end()) { - // Column not found in file (e.g., partition column, added column) + // Column not found in file continue; } auto field_schema = it->second; - // primitive (non-nested) types: direct mapping by name + // primitive (non-nested) types if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { column_ids.insert(field_schema->column_id); @@ -445,7 +338,7 @@ ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* fiel continue; } - // complex types: + // complex types const auto& all_access_paths = slot->all_access_paths(); process_access_paths(field_schema, all_access_paths, column_ids); @@ -460,16 +353,11 @@ ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* fiel ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index( const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) { - std::shared_ptr<TableSchemaChangeHelper::Node> schema_node = nullptr; - - if (!field_desc) { - return ColumnIdResult(); - } - // First, assign column IDs to the field descriptor auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc); mutable_field_desc->assign_ids(); + // map top-level table column position -> FieldSchema* std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map; for (int i = 0; i < field_desc->size(); ++i) { auto field_schema = field_desc->get_column(i); @@ -481,58 +369,26 @@ ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index( std::set<uint64_t> column_ids; std::set<uint64_t> filter_column_ids; - // helper to process name access paths for a given top-level parquet field + // helper to process access paths for a given top-level parquet field auto process_access_paths = [](const FieldSchema* parquet_field, const std::vector<TColumnAccessPath>& access_paths, std::set<uint64_t>& out_ids) { - bool access_paths_empty = access_paths.empty(); - - std::vector<std::vector<std::string>> paths; - bool has_top_level_only = false; - for (const auto& access_path : access_paths) { - std::vector<std::string> path; - if (access_path.type == TAccessPathType::DATA) { - path = access_path.data_access_path.path; - } else if (access_path.type == TAccessPathType::META) { - path = access_path.meta_access_path.path; - } else { - continue; - } - std::vector<std::string> remaining_path; - if (path.size() > 1) { - remaining_path.assign(path.begin() + 1, path.end()); - } else { - // only top-level column name => means whole field - remaining_path = std::vector<std::string>(); - } - if (remaining_path.empty()) { - has_top_level_only = true; - } - paths.push_back(std::move(remaining_path)); - } - - if (has_top_level_only || access_paths_empty) { - uint64_t start_id = parquet_field->get_column_id(); - uint64_t max_column_id = parquet_field->get_max_column_id(); - for (uint64_t id = start_id; id <= max_column_id; ++id) { - out_ids.insert(id); - } - } else if (!paths.empty()) { - out_ids.insert(parquet_field->get_column_id()); - HiveParquetNestedColumnUtils::extract_nested_column_ids(*parquet_field, paths, out_ids); - } + process_nested_access_paths( + parquet_field, access_paths, out_ids, + [](const FieldSchema* field) { return field->get_column_id(); }, + [](const FieldSchema* field) { return field->get_max_column_id(); }, + HiveParquetNestedColumnUtils::extract_nested_column_ids); }; for (const auto* slot : tuple_descriptor->slots()) { - // Find the field schema for this slot (may not exist for partition columns, etc.) auto it = table_col_pos_to_field_schema_map.find(slot->col_pos()); if (it == table_col_pos_to_field_schema_map.end()) { - // Column not found in file (e.g., partition column, added column) + // Column not found in file continue; } auto field_schema = it->second; - // primitive (non-nested) types: direct mapping by position + // primitive (non-nested) types if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { column_ids.insert(field_schema->column_id); @@ -543,7 +399,7 @@ ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index( continue; } - // complex types: + // complex types const auto& all_access_paths = slot->all_access_paths(); process_access_paths(field_schema, all_access_paths, column_ids); diff --git a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp index e3aacded188..42141589c8a 100644 --- a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp +++ b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp @@ -52,7 +52,6 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids( // Normalization logic: // path: ["map_col", "*"] → ["map_col", "VALUES"] + ["map_col", "KEYS"] // path: ["map_col", "*", "field"] → ["map_col", "VALUES", "field"] + ["map_col", "KEYS"] - // KEYS are always needed for correct RL/DL computation when accessing MAP via wildcard if (type.getKind() == orc::TypeKind::MAP) { auto wildcard_it = child_paths_by_field_id.find("*"); if (wildcard_it != child_paths_by_field_id.end()) { @@ -62,7 +61,7 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids( auto& values_paths = child_paths_by_field_id["VALUES"]; values_paths.insert(values_paths.end(), wildcard_paths.begin(), wildcard_paths.end()); - // Always add KEYS for wildcard access (needed for RL/DL computation) + // Always add KEYS for wildcard access auto& keys_paths = child_paths_by_field_id["KEYS"]; // Add an empty path to request full KEYS std::vector<std::string> empty_path; @@ -98,7 +97,7 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids( child_field_id = "VALUES"; } // Special handling for Orc MAP structure: - // When accessing only VALUES, we still need KEY structure for levels + // When accessing only VALUES, we still need KEY structure for deduplicate_keys // Check if we're at key child (i==0) and only VALUES is requested (no KEYS) if (i == 0) { bool has_keys_access = diff --git a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h index 576ec598970..5a5fe33dd4f 100644 --- a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h +++ b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h @@ -17,10 +17,8 @@ #pragma once -#include <memory> #include <set> #include <string> -#include <unordered_map> #include <vector> #include "vec/exec/format/table/table_format_reader.h" diff --git a/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp b/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp index 7741d498f99..48c508390ea 100644 --- a/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp +++ b/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp @@ -8,7 +8,7 @@ // // http://www.apache.org/licenses/LICENSE-2.0 // -// Unless required by applicable law or agreed in writing, +// Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the @@ -56,7 +56,6 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids( // Normalization logic: // path: ["map_col", "*"] → ["map_col", "VALUES"] + ["map_col", "KEYS"] // path: ["map_col", "*", "field"] → ["map_col", "VALUES", "field"] + ["map_col", "KEYS"] - // KEYS are always needed for correct RL/DL computation when accessing MAP via wildcard if (field_schema.data_type->get_primitive_type() == PrimitiveType::TYPE_MAP) { auto wildcard_it = child_paths_by_field_id.find("*"); if (wildcard_it != child_paths_by_field_id.end()) { @@ -66,7 +65,7 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids( auto& values_paths = child_paths_by_field_id["VALUES"]; values_paths.insert(values_paths.end(), wildcard_paths.begin(), wildcard_paths.end()); - // Always add KEYS for wildcard access (needed for RL/DL computation) + // Always add KEYS for wildcard access auto& keys_paths = child_paths_by_field_id["KEYS"]; // Add an empty path to request full KEYS std::vector<std::string> empty_path; @@ -114,10 +113,7 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids( uint64_t key_start_id = child.get_column_id(); uint64_t key_max_id = child.get_max_column_id(); for (uint64_t id = key_start_id; id <= key_max_id; ++id) { - auto inserted = column_ids.insert(id); - if (inserted.second) { - std::cout << "[IcebergNested] added column id: " << id << std::endl; - } + column_ids.insert(id); } has_child_columns = true; continue; // Skip further processing of key child @@ -147,10 +143,7 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids( uint64_t start_id = child.get_column_id(); uint64_t max_column_id = child.get_max_column_id(); for (uint64_t id = start_id; id <= max_column_id; ++id) { - auto inserted = column_ids.insert(id); - if (inserted.second) { - std::cout << "[IcebergNested] added column id: " << id << std::endl; - } + column_ids.insert(id); } has_child_columns = true; } else { @@ -172,11 +165,7 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids( // This ensures parent struct/container nodes are included when their children are needed if (has_child_columns) { // Set automatically handles deduplication, so no need to check if it already exists - auto inserted = column_ids.insert(field_schema.get_column_id()); - if (inserted.second) { - std::cout << "[IcebergNested] added parent column id: " << field_schema.get_column_id() - << std::endl; - } + column_ids.insert(field_schema.get_column_id()); } } diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp b/be/src/vec/exec/format/table/iceberg_reader.cpp index 1a010dc4454..4c8a8a584db 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.cpp +++ b/be/src/vec/exec/format/table/iceberg_reader.cpp @@ -55,6 +55,7 @@ #include "vec/exec/format/parquet/vparquet_column_chunk_reader.h" #include "vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h" #include "vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.h" +#include "vec/exec/format/table/nested_column_access_helper.h" #include "vec/exec/format/table/table_format_reader.h" namespace cctz { @@ -450,7 +451,6 @@ Status IcebergParquetReader::init_reader( _all_required_col_names = file_col_names; auto column_id_result = _create_column_ids(field_desc, tuple_descriptor); - // const auto& file_col_names = column_id_result.column_names; auto& column_ids = column_id_result.column_ids; const auto& filter_column_ids = column_id_result.filter_column_ids; @@ -471,15 +471,11 @@ Status IcebergParquetReader::init_reader( ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) { - if (!field_desc) { - return ColumnIdResult(); - } - // First, assign column IDs to the field descriptor auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc); mutable_field_desc->assign_ids(); - // Create a mapping from iceberg field_id to FieldSchema for quick lookup + // map top-level table column iceberg_id -> FieldSchema* std::unordered_map<int, const FieldSchema*> iceberg_id_to_field_schema_map; for (int i = 0; i < field_desc->size(); ++i) { @@ -493,47 +489,15 @@ ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* f std::set<uint64_t> column_ids; std::set<uint64_t> filter_column_ids; - // helper to process name access paths for a given top-level parquet field + // helper to process access paths for a given top-level parquet field auto process_access_paths = [](const FieldSchema* parquet_field, const std::vector<TColumnAccessPath>& access_paths, std::set<uint64_t>& out_ids) { - bool access_paths_empty = access_paths.empty(); - - std::vector<std::vector<std::string>> paths; - bool has_top_level_only = false; - for (const auto& access_path : access_paths) { - std::vector<std::string> path; - if (access_path.type == TAccessPathType::DATA) { - path = access_path.data_access_path.path; - } else if (access_path.type == TAccessPathType::META) { - path = access_path.meta_access_path.path; - } else { - continue; - } - std::vector<std::string> remaining_path; - if (path.size() > 1) { - remaining_path.assign(path.begin() + 1, path.end()); - } else { - // only top-level column name => means whole field - remaining_path = std::vector<std::string>(); - } - if (remaining_path.empty()) { - has_top_level_only = true; - } - paths.push_back(std::move(remaining_path)); - } - - if (has_top_level_only || access_paths_empty) { - uint64_t start_id = parquet_field->get_column_id(); - uint64_t max_column_id = parquet_field->get_max_column_id(); - for (uint64_t id = start_id; id <= max_column_id; ++id) { - out_ids.insert(id); - } - } else if (!paths.empty()) { - out_ids.insert(parquet_field->get_column_id()); - IcebergParquetNestedColumnUtils::extract_nested_column_ids(*parquet_field, paths, - out_ids); - } + process_nested_access_paths( + parquet_field, access_paths, out_ids, + [](const FieldSchema* field) { return field->get_column_id(); }, + [](const FieldSchema* field) { return field->get_max_column_id(); }, + IcebergParquetNestedColumnUtils::extract_nested_column_ids); }; for (const auto* slot : tuple_descriptor->slots()) { @@ -663,13 +627,7 @@ Status IcebergOrcReader::init_reader( ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) { - std::shared_ptr<TableSchemaChangeHelper::Node> schema_node = nullptr; - - if (!orc_type) { - return ColumnIdResult(); - } - - // map top-level table column name (lower-cased) -> orc::Type* + // map top-level table column iceberg_id -> orc::Type* std::unordered_map<int, const orc::Type*> iceberg_id_to_orc_type_map; for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) { auto orc_sub_type = orc_type->getSubtype(i); @@ -685,57 +643,26 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type, std::set<uint64_t> column_ids; std::set<uint64_t> filter_column_ids; - // helper to process name access paths for a given top-level orc field + // helper to process access paths for a given top-level orc field auto process_access_paths = [](const orc::Type* orc_field, const std::vector<TColumnAccessPath>& access_paths, std::set<uint64_t>& out_ids) { - bool access_paths_empty = access_paths.empty(); - - std::vector<std::vector<std::string>> paths; - bool has_top_level_only = false; - for (const auto& access_path : access_paths) { - std::vector<std::string> path; - if (access_path.type == TAccessPathType::DATA) { - path = access_path.data_access_path.path; - } else if (access_path.type == TAccessPathType::META) { - path = access_path.meta_access_path.path; - } else { - continue; - } - std::vector<std::string> remaining_path; - if (path.size() > 1) { - remaining_path.assign(path.begin() + 1, path.end()); - } else { - // only top-level column name => means whole field - remaining_path = std::vector<std::string>(); - } - if (remaining_path.empty()) { - has_top_level_only = true; - } - paths.push_back(std::move(remaining_path)); - } - - if (has_top_level_only || access_paths_empty) { - uint64_t start_id = orc_field->getColumnId(); - uint64_t max_column_id = orc_field->getMaximumColumnId(); - for (uint64_t id = start_id; id <= max_column_id; ++id) { - out_ids.insert(id); - } - } else if (!paths.empty()) { - out_ids.insert(orc_field->getColumnId()); - IcebergOrcNestedColumnUtils::extract_nested_column_ids(*orc_field, paths, out_ids); - } + process_nested_access_paths( + orc_field, access_paths, out_ids, + [](const orc::Type* type) { return type->getColumnId(); }, + [](const orc::Type* type) { return type->getMaximumColumnId(); }, + IcebergOrcNestedColumnUtils::extract_nested_column_ids); }; for (const auto* slot : tuple_descriptor->slots()) { auto it = iceberg_id_to_orc_type_map.find(slot->col_unique_id()); if (it == iceberg_id_to_orc_type_map.end()) { - // Column not found in file (e.g., partition column, added column) + // Column not found in file continue; } const orc::Type* orc_field = it->second; - // primitive (non-nested) types: direct mapping by name + // primitive (non-nested) types if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { column_ids.insert(orc_field->getColumnId()); @@ -745,7 +672,7 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type, continue; } - // complex types: + // complex types const auto& all_access_paths = slot->all_access_paths(); process_access_paths(orc_field, all_access_paths, column_ids); diff --git a/be/src/vec/exec/format/table/nested_column_access_helper.h b/be/src/vec/exec/format/table/nested_column_access_helper.h new file mode 100644 index 00000000000..316cfb7aebe --- /dev/null +++ b/be/src/vec/exec/format/table/nested_column_access_helper.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <set> +#include <string> +#include <vector> + +#include "vec/exec/format/table/table_format_reader.h" + +namespace doris::vectorized { +#include "common/compile_check_begin.h" + +// Helper that normalizes access paths and delegates nested column id extraction. +// The caller provides how to access the column id range for the concrete field type +// (Parquet FieldSchema, ORC Type, etc.) plus a nested extractor implementation. +template <typename FieldType, typename ColumnIdGetter, typename MaxColumnIdGetter, + typename ExtractNestedFunc> +void process_nested_access_paths(const FieldType* field, + const std::vector<TColumnAccessPath>& access_paths, + std::set<uint64_t>& out_ids, ColumnIdGetter&& column_id_getter, + MaxColumnIdGetter&& max_column_id_getter, + ExtractNestedFunc&& extract_nested) { + if (field == nullptr) { + return; + } + + const bool access_paths_empty = access_paths.empty(); + std::vector<std::vector<std::string>> paths; + paths.reserve(access_paths.size()); + bool has_top_level_only = false; + + for (const auto& access_path : access_paths) { + const std::vector<std::string>* path_ptr = nullptr; + if (access_path.type == TAccessPathType::DATA) { + path_ptr = &access_path.data_access_path.path; + } else if (access_path.type == TAccessPathType::META) { + path_ptr = &access_path.meta_access_path.path; + } else { + continue; + } + + const auto& path = *path_ptr; + std::vector<std::string> remaining_path; + if (path.size() > 1) { + remaining_path.assign(path.begin() + 1, path.end()); + } + if (remaining_path.empty()) { + has_top_level_only = true; + } + paths.push_back(std::move(remaining_path)); + } + + const uint64_t column_id = column_id_getter(field); + if (has_top_level_only || access_paths_empty) { + const uint64_t max_column_id = max_column_id_getter(field); + for (uint64_t id = column_id; id <= max_column_id; ++id) { + out_ids.insert(id); + } + } else if (!paths.empty()) { + out_ids.insert(column_id); + extract_nested(*field, paths, out_ids); + } +} + +#include "common/compile_check_end.h" +} // namespace doris::vectorized --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
