(doris) 01/01: code refactoring for external table.

kakachen Sun, 09 Nov 2025 18:39:22 -0800

This is an automated email from the ASF dual-hosted git repository.

kakachen pushed a commit to branch nested_column_prune
in repository https://gitbox.apache.org/repos/asf/doris.git


commit d234ae2df72e3b248234d9c3b4a136f2a83f9b45
Author: kakachen <[email protected]>
AuthorDate: Sun Nov 9 22:01:41 2025 +0800

    code refactoring for external table.
---
 .idea/icon.svg                                     |   7 -
 .idea/vcs.xml                                      |  22 --
 .../exec/format/parquet/vparquet_column_reader.cpp |   2 +
 .../table/hive/hive_orc_nested_column_utils.cpp    |   5 +-
 .../hive/hive_parquet_nested_column_utils.cpp      |   3 +-
 be/src/vec/exec/format/table/hive_reader.cpp       | 222 ++++-----------------
 .../iceberg/iceberg_orc_nested_column_utils.cpp    |   5 +-
 .../iceberg/iceberg_orc_nested_column_utils.h      |   2 -
 .../iceberg_parquet_nested_column_utils.cpp        |  21 +-
 be/src/vec/exec/format/table/iceberg_reader.cpp    | 109 ++--------
 .../format/table/nested_column_access_helper.h     |  82 ++++++++
 11 files changed, 151 insertions(+), 329 deletions(-)

diff --git a/.idea/icon.svg b/.idea/icon.svg
deleted file mode 100644
index 7bd05ce60a8..00000000000
--- a/.idea/icon.svg
+++ /dev/null
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
-<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="512px" height="512px" style="shape-rendering:geometricPrecision; text-rendering:geometricPrecision; image-rendering:optimizeQuality; fill-rule:evenodd; clip-rule:evenodd" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g><path style="opacity:0.998" fill="#14a8c9" d="M 177.5,-0.5 C 185.5,-0.5 
193.5,-0.5 201.5,-0.5C 216.485,2.49184 229.818,8.99184 241.5,19C 
263.362,40.5281 284.862,62.3615 306,84.5C 321.873,104.738 325.54,127.072 
317,151.5C 313.365,160.772 308.699,169.439 303,177.5C 287.138,193.029 
271.638,208.862 256.5,225C 251.896,228.6 246.896,229.267 241.5,227C 
205.667,191.167 169.833,155.333 134,119.5C 114.347,92.1441 112.347,63.4775 
128,33.5C 139.766,15.2282 156.266,3.8949 177.5,-0.5 Z"/></g>
-<g><path style="opacity:0.991" fill="#5168ac" d="M 80.5,117.5 C 
83.5823,118.017 86.5823,118.85 89.5,120C 131.667,162.167 173.833,204.333 
216,246.5C 219.574,253.075 219.241,259.408 215,265.5C 173.167,307.333 
131.333,349.167 89.5,391C 79.3707,395.174 72.204,392.341 68,382.5C 
67.3333,297.833 67.3333,213.167 68,128.5C 69.8906,122.153 74.0573,118.486 
80.5,117.5 Z"/></g>
-<g><path style="opacity:0.998" fill="#51c9a2" d="M 206.5,511.5 C 197.5,511.5 
188.5,511.5 179.5,511.5C 148.554,504.392 128.72,485.392 120,454.5C 
114.417,431.171 119.083,410.171 134,391.5C 199.306,325.194 264.972,259.194 
331,193.5C 345.241,176.315 354.407,156.648 358.5,134.5C 376.154,152.988 
394.321,170.988 413,188.5C 422.048,199.213 430.714,210.213 439,221.5C 
452.98,251.039 450.313,279.039 431,305.5C 369.473,367.361 307.639,428.861 
245.5,490C 240.124,495.044 234.458,499.711 228.5,504C 221 [...]
-</svg>
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 8b083db2f94..00000000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="IssueNavigationConfiguration">
-    <option name="links">
-      <list>
-        <IssueNavigationLink>
-          <option name="issueRegexp" value="#(\d+)" />
-          <option name="linkRegexp" value="https://github.com/apache/doris/pull/$1" />
-        </IssueNavigationLink>
-      </list>
-    </option>
-  </component>
-  <component name="VcsDirectoryMappings">
-    <mapping directory="$PROJECT_DIR$" vcs="Git" />
-    <mapping directory="$PROJECT_DIR$/be/src/apache-orc" vcs="Git" />
-    <mapping directory="$PROJECT_DIR$/be/src/clucene" vcs="Git" />
-    <mapping directory="$PROJECT_DIR$/contrib/apache-orc" vcs="Git" />
-    <mapping directory="$PROJECT_DIR$/contrib/clucene" vcs="Git" />
-    <mapping directory="$PROJECT_DIR$/contrib/faiss" vcs="Git" />
-    <mapping directory="$PROJECT_DIR$/contrib/openblas" vcs="Git" />
-  </component>
-</project>
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
index 589570c7ab0..38c53f961be 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -995,6 +995,8 @@ Status StructColumnReader::read_column_data(
 
         if (reference_reader != nullptr) {
             // Read the reference column to get correct RL/DL information
+            // TODO: Optimize by only reading RL/DL without actual data 
decoding
+
             // We need to find the FieldSchema for the reference column from 
_field_schema children
             FieldSchema* ref_field_schema = nullptr;
             for (auto& child : _field_schema->children) {
diff --git a/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp 
b/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp
index 1cd02967f0d..9c97747e5a4 100644
--- a/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp
+++ b/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp
@@ -55,7 +55,6 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids(
     // Normalization logic:
     //   path: ["map_col", "*"]              → ["map_col", "VALUES"] + 
["map_col", "KEYS"]
     //   path: ["map_col", "*", "field"]     → ["map_col", "VALUES", "field"] 
+ ["map_col", "KEYS"]
-    // KEYS are always needed for correct RL/DL computation when accessing MAP 
via wildcard
     if (type.getKind() == orc::TypeKind::MAP) {
         auto wildcard_it = child_paths_by_table_col_name.find("*");
         if (wildcard_it != child_paths_by_table_col_name.end()) {
@@ -65,7 +64,7 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids(
             auto& values_paths = child_paths_by_table_col_name["VALUES"];
             values_paths.insert(values_paths.end(), wildcard_paths.begin(), 
wildcard_paths.end());
 
-            // Always add KEYS for wildcard access (needed for RL/DL 
computation)
+            // Always add KEYS for wildcard access
             auto& keys_paths = child_paths_by_table_col_name["KEYS"];
             // Add an empty path to request full KEYS
             std::vector<std::string> empty_path;
@@ -97,7 +96,7 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids(
             }
 
             // Special handling for Orc MAP structure:
-            // When accessing only VALUES, we still need KEY structure for 
levels
+            // When accessing only VALUES, we still need KEY structure for 
deduplicate_keys
             // Check if we're at key child (i==0) and only VALUES is requested 
(no KEYS)
             if (i == 0) {
                 bool has_keys_access = 
child_paths_by_table_col_name.find("KEYS") !=
diff --git 
a/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp 
b/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp
index a0052ecfbfc..c2223c2aa18 100644
--- a/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp
+++ b/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp
@@ -56,7 +56,6 @@ void HiveParquetNestedColumnUtils::extract_nested_column_ids(
     // Normalization logic:
     //   path: ["map_col", "*"]              → ["map_col", "VALUES"] + 
["map_col", "KEYS"]
     //   path: ["map_col", "*", "field"]     → ["map_col", "VALUES", "field"] 
+ ["map_col", "KEYS"]
-    // KEYS are always needed for correct RL/DL computation when accessing MAP 
via wildcard
     if (field_schema.data_type->get_primitive_type() == 
PrimitiveType::TYPE_MAP) {
         auto wildcard_it = child_paths_by_table_col_name.find("*");
         if (wildcard_it != child_paths_by_table_col_name.end()) {
@@ -66,7 +65,7 @@ void HiveParquetNestedColumnUtils::extract_nested_column_ids(
             auto& values_paths = child_paths_by_table_col_name["VALUES"];
             values_paths.insert(values_paths.end(), wildcard_paths.begin(), 
wildcard_paths.end());
 
-            // Always add KEYS for wildcard access (needed for RL/DL 
computation)
+            // Always add KEYS for wildcard access
             auto& keys_paths = child_paths_by_table_col_name["KEYS"];
             // Add an empty path to request full KEYS
             std::vector<std::string> empty_path;
diff --git a/be/src/vec/exec/format/table/hive_reader.cpp 
b/be/src/vec/exec/format/table/hive_reader.cpp
index b9bc01fe7b6..9f23c23f0bf 100644
--- a/be/src/vec/exec/format/table/hive_reader.cpp
+++ b/be/src/vec/exec/format/table/hive_reader.cpp
@@ -23,6 +23,7 @@
 #include "runtime/runtime_state.h"
 #include "vec/exec/format/table/hive/hive_orc_nested_column_utils.h"
 #include "vec/exec/format/table/hive/hive_parquet_nested_column_utils.h"
+#include "vec/exec/format/table/nested_column_access_helper.h"
 
 namespace doris::vectorized {
 #include "common/compile_check_begin.h"
@@ -87,7 +88,6 @@ Status HiveOrcReader::init_reader(
                 _create_column_ids_by_top_level_col_index(orc_type_ptr, 
tuple_descriptor);
     }
 
-    // const auto& file_col_names = column_id_result.column_names;
     const auto& column_ids = column_id_result.column_ids;
     const auto& filter_column_ids = column_id_result.filter_column_ids;
 
@@ -99,10 +99,6 @@ Status HiveOrcReader::init_reader(
 
 ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
                                                  const TupleDescriptor* 
tuple_descriptor) {
-    if (!orc_type) {
-        return ColumnIdResult();
-    }
-
     // map top-level table column name (lower-cased) -> orc::Type*
     std::unordered_map<std::string, const orc::Type*> 
table_col_name_to_orc_type_map;
     for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
@@ -116,57 +112,26 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const 
orc::Type* orc_type,
     std::set<uint64_t> column_ids;
     std::set<uint64_t> filter_column_ids;
 
-    // helper to process name access paths for a given top-level orc field
+    // helper to process access paths for a given top-level orc field
     auto process_access_paths = [](const orc::Type* orc_field,
                                    const std::vector<TColumnAccessPath>& 
access_paths,
                                    std::set<uint64_t>& out_ids) {
-        bool access_paths_empty = access_paths.empty();
-
-        std::vector<std::vector<std::string>> paths;
-        bool has_top_level_only = false;
-        for (const auto& access_path : access_paths) {
-            std::vector<std::string> path;
-            if (access_path.type == TAccessPathType::DATA) {
-                path = access_path.data_access_path.path;
-            } else if (access_path.type == TAccessPathType::META) {
-                path = access_path.meta_access_path.path;
-            } else {
-                continue;
-            }
-            std::vector<std::string> remaining_path;
-            if (path.size() > 1) {
-                remaining_path.assign(path.begin() + 1, path.end());
-            } else {
-                // only top-level column name => means whole field
-                remaining_path = std::vector<std::string>();
-            }
-            if (remaining_path.empty()) {
-                has_top_level_only = true;
-            }
-            paths.push_back(std::move(remaining_path));
-        }
-
-        if (has_top_level_only || access_paths_empty) {
-            uint64_t start_id = orc_field->getColumnId();
-            uint64_t max_column_id = orc_field->getMaximumColumnId();
-            for (uint64_t id = start_id; id <= max_column_id; ++id) {
-                out_ids.insert(id);
-            }
-        } else if (!paths.empty()) {
-            out_ids.insert(orc_field->getColumnId());
-            HiveOrcNestedColumnUtils::extract_nested_column_ids(*orc_field, 
paths, out_ids);
-        }
+        process_nested_access_paths(
+                orc_field, access_paths, out_ids,
+                [](const orc::Type* type) { return type->getColumnId(); },
+                [](const orc::Type* type) { return type->getMaximumColumnId(); 
},
+                HiveOrcNestedColumnUtils::extract_nested_column_ids);
     };
 
     for (const auto* slot : tuple_descriptor->slots()) {
         auto it = 
table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
         if (it == table_col_name_to_orc_type_map.end()) {
-            // Column not found in file (e.g., partition column, added column)
+            // Column not found in file
             continue;
         }
         const orc::Type* orc_field = it->second;
 
-        // primitive (non-nested) types: direct mapping by name
+        // primitive (non-nested) types
         if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY 
&&
              slot->col_type() != TYPE_MAP)) {
             column_ids.insert(orc_field->getColumnId());
@@ -176,7 +141,7 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const 
orc::Type* orc_type,
             continue;
         }
 
-        // complex types:
+        // complex types
         const auto& all_access_paths = slot->all_access_paths();
         process_access_paths(orc_field, all_access_paths, column_ids);
 
@@ -191,13 +156,7 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const 
orc::Type* orc_type,
 
 ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
         const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
-    std::shared_ptr<TableSchemaChangeHelper::Node> schema_node = nullptr;
-
-    if (!orc_type) {
-        return ColumnIdResult();
-    }
-
-    // map top-level table column index -> orc::Type*
+    // map top-level table column position -> orc::Type*
     std::unordered_map<uint64_t, const orc::Type*> 
table_col_pos_to_orc_type_map;
     for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
         auto orc_sub_type = orc_type->getSubtype(i);
@@ -209,57 +168,26 @@ ColumnIdResult 
HiveOrcReader::_create_column_ids_by_top_level_col_index(
     std::set<uint64_t> column_ids;
     std::set<uint64_t> filter_column_ids;
 
-    // helper to process name access paths for a given top-level orc field
+    // helper to process access paths for a given top-level orc field
     auto process_access_paths = [](const orc::Type* orc_field,
                                    const std::vector<TColumnAccessPath>& 
access_paths,
                                    std::set<uint64_t>& out_ids) {
-        bool access_paths_empty = access_paths.empty();
-
-        std::vector<std::vector<std::string>> paths;
-        bool has_top_level_only = false;
-        for (const auto& access_path : access_paths) {
-            std::vector<std::string> path;
-            if (access_path.type == TAccessPathType::DATA) {
-                path = access_path.data_access_path.path;
-            } else if (access_path.type == TAccessPathType::META) {
-                path = access_path.meta_access_path.path;
-            } else {
-                continue;
-            }
-            std::vector<std::string> remaining_path;
-            if (path.size() > 1) {
-                remaining_path.assign(path.begin() + 1, path.end());
-            } else {
-                // only top-level column name => means whole field
-                remaining_path = std::vector<std::string>();
-            }
-            if (remaining_path.empty()) {
-                has_top_level_only = true;
-            }
-            paths.push_back(std::move(remaining_path));
-        }
-
-        if (has_top_level_only || access_paths_empty) {
-            uint64_t start_id = orc_field->getColumnId();
-            uint64_t max_column_id = orc_field->getMaximumColumnId();
-            for (uint64_t id = start_id; id <= max_column_id; ++id) {
-                out_ids.insert(id);
-            }
-        } else if (!paths.empty()) {
-            out_ids.insert(orc_field->getColumnId());
-            HiveOrcNestedColumnUtils::extract_nested_column_ids(*orc_field, 
paths, out_ids);
-        }
+        process_nested_access_paths(
+                orc_field, access_paths, out_ids,
+                [](const orc::Type* type) { return type->getColumnId(); },
+                [](const orc::Type* type) { return type->getMaximumColumnId(); 
},
+                HiveOrcNestedColumnUtils::extract_nested_column_ids);
     };
 
     for (const auto* slot : tuple_descriptor->slots()) {
         auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
         if (it == table_col_pos_to_orc_type_map.end()) {
-            // Column not found in file (e.g., partition column, added column)
+            // Column not found in file
             continue;
         }
         const orc::Type* orc_field = it->second;
 
-        // primitive (non-nested) types: direct mapping by pos
+        // primitive (non-nested) types
         if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY 
&&
              slot->col_type() != TYPE_MAP)) {
             column_ids.insert(orc_field->getColumnId());
@@ -364,14 +292,11 @@ Status HiveParquetReader::init_reader(
 
 ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* 
field_desc,
                                                      const TupleDescriptor* 
tuple_descriptor) {
-    if (!field_desc) {
-        return ColumnIdResult();
-    }
-
     // First, assign column IDs to the field descriptor
     auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
     mutable_field_desc->assign_ids();
 
+    // map top-level table column name (lower-cased) -> FieldSchema*
     std::unordered_map<std::string, const FieldSchema*> 
table_col_name_to_field_schema_map;
     for (int i = 0; i < field_desc->size(); ++i) {
         auto field_schema = field_desc->get_column(i);
@@ -383,58 +308,26 @@ ColumnIdResult 
HiveParquetReader::_create_column_ids(const FieldDescriptor* fiel
     std::set<uint64_t> column_ids;
     std::set<uint64_t> filter_column_ids;
 
-    // helper to process name access paths for a given top-level parquet field
+    // helper to process access paths for a given top-level parquet field
     auto process_access_paths = [](const FieldSchema* parquet_field,
                                    const std::vector<TColumnAccessPath>& 
access_paths,
                                    std::set<uint64_t>& out_ids) {
-        bool access_paths_empty = access_paths.empty();
-
-        std::vector<std::vector<std::string>> paths;
-        bool has_top_level_only = false;
-        for (const auto& access_path : access_paths) {
-            std::vector<std::string> path;
-            if (access_path.type == TAccessPathType::DATA) {
-                path = access_path.data_access_path.path;
-            } else if (access_path.type == TAccessPathType::META) {
-                path = access_path.meta_access_path.path;
-            } else {
-                continue;
-            }
-            std::vector<std::string> remaining_path;
-            if (path.size() > 1) {
-                remaining_path.assign(path.begin() + 1, path.end());
-            } else {
-                // only top-level column name => means whole field
-                remaining_path = std::vector<std::string>();
-            }
-            if (remaining_path.empty()) {
-                has_top_level_only = true;
-            }
-            paths.push_back(std::move(remaining_path));
-        }
-
-        if (has_top_level_only || access_paths_empty) {
-            uint64_t start_id = parquet_field->get_column_id();
-            uint64_t max_column_id = parquet_field->get_max_column_id();
-            for (uint64_t id = start_id; id <= max_column_id; ++id) {
-                out_ids.insert(id);
-            }
-        } else if (!paths.empty()) {
-            out_ids.insert(parquet_field->get_column_id());
-            
HiveParquetNestedColumnUtils::extract_nested_column_ids(*parquet_field, paths, 
out_ids);
-        }
+        process_nested_access_paths(
+                parquet_field, access_paths, out_ids,
+                [](const FieldSchema* field) { return field->get_column_id(); 
},
+                [](const FieldSchema* field) { return 
field->get_max_column_id(); },
+                HiveParquetNestedColumnUtils::extract_nested_column_ids);
     };
 
     for (const auto* slot : tuple_descriptor->slots()) {
-        // Find the field schema for this slot (may not exist for partition 
columns, etc.)
         auto it = 
table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
         if (it == table_col_name_to_field_schema_map.end()) {
-            // Column not found in file (e.g., partition column, added column)
+            // Column not found in file
             continue;
         }
         auto field_schema = it->second;
 
-        // primitive (non-nested) types: direct mapping by name
+        // primitive (non-nested) types
         if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY 
&&
              slot->col_type() != TYPE_MAP)) {
             column_ids.insert(field_schema->column_id);
@@ -445,7 +338,7 @@ ColumnIdResult HiveParquetReader::_create_column_ids(const 
FieldDescriptor* fiel
             continue;
         }
 
-        // complex types:
+        // complex types
         const auto& all_access_paths = slot->all_access_paths();
         process_access_paths(field_schema, all_access_paths, column_ids);
 
@@ -460,16 +353,11 @@ ColumnIdResult 
HiveParquetReader::_create_column_ids(const FieldDescriptor* fiel
 
 ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
         const FieldDescriptor* field_desc, const TupleDescriptor* 
tuple_descriptor) {
-    std::shared_ptr<TableSchemaChangeHelper::Node> schema_node = nullptr;
-
-    if (!field_desc) {
-        return ColumnIdResult();
-    }
-
     // First, assign column IDs to the field descriptor
     auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
     mutable_field_desc->assign_ids();
 
+    // map top-level table column position -> FieldSchema*
     std::unordered_map<uint64_t, const FieldSchema*> 
table_col_pos_to_field_schema_map;
     for (int i = 0; i < field_desc->size(); ++i) {
         auto field_schema = field_desc->get_column(i);
@@ -481,58 +369,26 @@ ColumnIdResult 
HiveParquetReader::_create_column_ids_by_top_level_col_index(
     std::set<uint64_t> column_ids;
     std::set<uint64_t> filter_column_ids;
 
-    // helper to process name access paths for a given top-level parquet field
+    // helper to process access paths for a given top-level parquet field
     auto process_access_paths = [](const FieldSchema* parquet_field,
                                    const std::vector<TColumnAccessPath>& 
access_paths,
                                    std::set<uint64_t>& out_ids) {
-        bool access_paths_empty = access_paths.empty();
-
-        std::vector<std::vector<std::string>> paths;
-        bool has_top_level_only = false;
-        for (const auto& access_path : access_paths) {
-            std::vector<std::string> path;
-            if (access_path.type == TAccessPathType::DATA) {
-                path = access_path.data_access_path.path;
-            } else if (access_path.type == TAccessPathType::META) {
-                path = access_path.meta_access_path.path;
-            } else {
-                continue;
-            }
-            std::vector<std::string> remaining_path;
-            if (path.size() > 1) {
-                remaining_path.assign(path.begin() + 1, path.end());
-            } else {
-                // only top-level column name => means whole field
-                remaining_path = std::vector<std::string>();
-            }
-            if (remaining_path.empty()) {
-                has_top_level_only = true;
-            }
-            paths.push_back(std::move(remaining_path));
-        }
-
-        if (has_top_level_only || access_paths_empty) {
-            uint64_t start_id = parquet_field->get_column_id();
-            uint64_t max_column_id = parquet_field->get_max_column_id();
-            for (uint64_t id = start_id; id <= max_column_id; ++id) {
-                out_ids.insert(id);
-            }
-        } else if (!paths.empty()) {
-            out_ids.insert(parquet_field->get_column_id());
-            
HiveParquetNestedColumnUtils::extract_nested_column_ids(*parquet_field, paths, 
out_ids);
-        }
+        process_nested_access_paths(
+                parquet_field, access_paths, out_ids,
+                [](const FieldSchema* field) { return field->get_column_id(); 
},
+                [](const FieldSchema* field) { return 
field->get_max_column_id(); },
+                HiveParquetNestedColumnUtils::extract_nested_column_ids);
     };
 
     for (const auto* slot : tuple_descriptor->slots()) {
-        // Find the field schema for this slot (may not exist for partition 
columns, etc.)
         auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
         if (it == table_col_pos_to_field_schema_map.end()) {
-            // Column not found in file (e.g., partition column, added column)
+            // Column not found in file
             continue;
         }
         auto field_schema = it->second;
 
-        // primitive (non-nested) types: direct mapping by position
+        // primitive (non-nested) types
         if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY 
&&
              slot->col_type() != TYPE_MAP)) {
             column_ids.insert(field_schema->column_id);
@@ -543,7 +399,7 @@ ColumnIdResult 
HiveParquetReader::_create_column_ids_by_top_level_col_index(
             continue;
         }
 
-        // complex types:
+        // complex types
         const auto& all_access_paths = slot->all_access_paths();
         process_access_paths(field_schema, all_access_paths, column_ids);
 
diff --git 
a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp 
b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp
index e3aacded188..42141589c8a 100644
--- a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp
+++ b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp
@@ -52,7 +52,6 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids(
     // Normalization logic:
     //   path: ["map_col", "*"]              → ["map_col", "VALUES"] + 
["map_col", "KEYS"]
     //   path: ["map_col", "*", "field"]     → ["map_col", "VALUES", "field"] 
+ ["map_col", "KEYS"]
-    // KEYS are always needed for correct RL/DL computation when accessing MAP 
via wildcard
     if (type.getKind() == orc::TypeKind::MAP) {
         auto wildcard_it = child_paths_by_field_id.find("*");
         if (wildcard_it != child_paths_by_field_id.end()) {
@@ -62,7 +61,7 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids(
             auto& values_paths = child_paths_by_field_id["VALUES"];
             values_paths.insert(values_paths.end(), wildcard_paths.begin(), 
wildcard_paths.end());
 
-            // Always add KEYS for wildcard access (needed for RL/DL 
computation)
+            // Always add KEYS for wildcard access
             auto& keys_paths = child_paths_by_field_id["KEYS"];
             // Add an empty path to request full KEYS
             std::vector<std::string> empty_path;
@@ -98,7 +97,7 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids(
                 child_field_id = "VALUES";
             }
             // Special handling for Orc MAP structure:
-            // When accessing only VALUES, we still need KEY structure for 
levels
+            // When accessing only VALUES, we still need KEY structure for 
deduplicate_keys
             // Check if we're at key child (i==0) and only VALUES is requested 
(no KEYS)
             if (i == 0) {
                 bool has_keys_access =
diff --git 
a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h 
b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h
index 576ec598970..5a5fe33dd4f 100644
--- a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h
+++ b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h
@@ -17,10 +17,8 @@
 
 #pragma once
 
-#include <memory>
 #include <set>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 #include "vec/exec/format/table/table_format_reader.h"
diff --git 
a/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp 
b/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp
index 7741d498f99..48c508390ea 100644
--- 
a/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp
+++ 
b/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp
@@ -8,7 +8,7 @@
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
-// Unless required by applicable law or agreed in writing,
+// Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
@@ -56,7 +56,6 @@ void 
IcebergParquetNestedColumnUtils::extract_nested_column_ids(
     // Normalization logic:
     //   path: ["map_col", "*"]              → ["map_col", "VALUES"] + 
["map_col", "KEYS"]
     //   path: ["map_col", "*", "field"]     → ["map_col", "VALUES", "field"] 
+ ["map_col", "KEYS"]
-    // KEYS are always needed for correct RL/DL computation when accessing MAP 
via wildcard
     if (field_schema.data_type->get_primitive_type() == 
PrimitiveType::TYPE_MAP) {
         auto wildcard_it = child_paths_by_field_id.find("*");
         if (wildcard_it != child_paths_by_field_id.end()) {
@@ -66,7 +65,7 @@ void 
IcebergParquetNestedColumnUtils::extract_nested_column_ids(
             auto& values_paths = child_paths_by_field_id["VALUES"];
             values_paths.insert(values_paths.end(), wildcard_paths.begin(), 
wildcard_paths.end());
 
-            // Always add KEYS for wildcard access (needed for RL/DL 
computation)
+            // Always add KEYS for wildcard access
             auto& keys_paths = child_paths_by_field_id["KEYS"];
             // Add an empty path to request full KEYS
             std::vector<std::string> empty_path;
@@ -114,10 +113,7 @@ void 
IcebergParquetNestedColumnUtils::extract_nested_column_ids(
                     uint64_t key_start_id = child.get_column_id();
                     uint64_t key_max_id = child.get_max_column_id();
                     for (uint64_t id = key_start_id; id <= key_max_id; ++id) {
-                        auto inserted = column_ids.insert(id);
-                        if (inserted.second) {
-                            std::cout << "[IcebergNested] added column id: " 
<< id << std::endl;
-                        }
+                        column_ids.insert(id);
                     }
                     has_child_columns = true;
                     continue; // Skip further processing of key child
@@ -147,10 +143,7 @@ void 
IcebergParquetNestedColumnUtils::extract_nested_column_ids(
                 uint64_t start_id = child.get_column_id();
                 uint64_t max_column_id = child.get_max_column_id();
                 for (uint64_t id = start_id; id <= max_column_id; ++id) {
-                    auto inserted = column_ids.insert(id);
-                    if (inserted.second) {
-                        std::cout << "[IcebergNested] added column id: " << id 
<< std::endl;
-                    }
+                    column_ids.insert(id);
                 }
                 has_child_columns = true;
             } else {
@@ -172,11 +165,7 @@ void 
IcebergParquetNestedColumnUtils::extract_nested_column_ids(
     // This ensures parent struct/container nodes are included when their 
children are needed
     if (has_child_columns) {
         // Set automatically handles deduplication, so no need to check if it 
already exists
-        auto inserted = column_ids.insert(field_schema.get_column_id());
-        if (inserted.second) {
-            std::cout << "[IcebergNested] added parent column id: " << 
field_schema.get_column_id()
-                      << std::endl;
-        }
+        column_ids.insert(field_schema.get_column_id());
     }
 }
 
diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp 
b/be/src/vec/exec/format/table/iceberg_reader.cpp
index 1a010dc4454..4c8a8a584db 100644
--- a/be/src/vec/exec/format/table/iceberg_reader.cpp
+++ b/be/src/vec/exec/format/table/iceberg_reader.cpp
@@ -55,6 +55,7 @@
 #include "vec/exec/format/parquet/vparquet_column_chunk_reader.h"
 #include "vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.h"
 #include "vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.h"
+#include "vec/exec/format/table/nested_column_access_helper.h"
 #include "vec/exec/format/table/table_format_reader.h"
 
 namespace cctz {
@@ -450,7 +451,6 @@ Status IcebergParquetReader::init_reader(
     _all_required_col_names = file_col_names;
 
     auto column_id_result = _create_column_ids(field_desc, tuple_descriptor);
-    // const auto& file_col_names = column_id_result.column_names;
     auto& column_ids = column_id_result.column_ids;
     const auto& filter_column_ids = column_id_result.filter_column_ids;
 
@@ -471,15 +471,11 @@ Status IcebergParquetReader::init_reader(
 
 ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* 
field_desc,
                                                         const TupleDescriptor* 
tuple_descriptor) {
-    if (!field_desc) {
-        return ColumnIdResult();
-    }
-
     // First, assign column IDs to the field descriptor
     auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
     mutable_field_desc->assign_ids();
 
-    // Create a mapping from iceberg field_id to FieldSchema for quick lookup
+    // map top-level table column iceberg_id -> FieldSchema*
     std::unordered_map<int, const FieldSchema*> iceberg_id_to_field_schema_map;
 
     for (int i = 0; i < field_desc->size(); ++i) {
@@ -493,47 +489,15 @@ ColumnIdResult 
IcebergParquetReader::_create_column_ids(const FieldDescriptor* f
     std::set<uint64_t> column_ids;
     std::set<uint64_t> filter_column_ids;
 
-    // helper to process name access paths for a given top-level parquet field
+    // helper to process access paths for a given top-level parquet field
     auto process_access_paths = [](const FieldSchema* parquet_field,
                                    const std::vector<TColumnAccessPath>& 
access_paths,
                                    std::set<uint64_t>& out_ids) {
-        bool access_paths_empty = access_paths.empty();
-
-        std::vector<std::vector<std::string>> paths;
-        bool has_top_level_only = false;
-        for (const auto& access_path : access_paths) {
-            std::vector<std::string> path;
-            if (access_path.type == TAccessPathType::DATA) {
-                path = access_path.data_access_path.path;
-            } else if (access_path.type == TAccessPathType::META) {
-                path = access_path.meta_access_path.path;
-            } else {
-                continue;
-            }
-            std::vector<std::string> remaining_path;
-            if (path.size() > 1) {
-                remaining_path.assign(path.begin() + 1, path.end());
-            } else {
-                // only top-level column name => means whole field
-                remaining_path = std::vector<std::string>();
-            }
-            if (remaining_path.empty()) {
-                has_top_level_only = true;
-            }
-            paths.push_back(std::move(remaining_path));
-        }
-
-        if (has_top_level_only || access_paths_empty) {
-            uint64_t start_id = parquet_field->get_column_id();
-            uint64_t max_column_id = parquet_field->get_max_column_id();
-            for (uint64_t id = start_id; id <= max_column_id; ++id) {
-                out_ids.insert(id);
-            }
-        } else if (!paths.empty()) {
-            out_ids.insert(parquet_field->get_column_id());
-            
IcebergParquetNestedColumnUtils::extract_nested_column_ids(*parquet_field, 
paths,
-                                                                       
out_ids);
-        }
+        process_nested_access_paths(
+                parquet_field, access_paths, out_ids,
+                [](const FieldSchema* field) { return field->get_column_id(); 
},
+                [](const FieldSchema* field) { return 
field->get_max_column_id(); },
+                IcebergParquetNestedColumnUtils::extract_nested_column_ids);
     };
 
     for (const auto* slot : tuple_descriptor->slots()) {
@@ -663,13 +627,7 @@ Status IcebergOrcReader::init_reader(
 
 ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type,
                                                     const TupleDescriptor* 
tuple_descriptor) {
-    std::shared_ptr<TableSchemaChangeHelper::Node> schema_node = nullptr;
-
-    if (!orc_type) {
-        return ColumnIdResult();
-    }
-
-    // map top-level table column name (lower-cased) -> orc::Type*
+    // map top-level table column iceberg_id -> orc::Type*
     std::unordered_map<int, const orc::Type*> iceberg_id_to_orc_type_map;
     for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
         auto orc_sub_type = orc_type->getSubtype(i);
@@ -685,57 +643,26 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const 
orc::Type* orc_type,
     std::set<uint64_t> column_ids;
     std::set<uint64_t> filter_column_ids;
 
-    // helper to process name access paths for a given top-level orc field
+    // helper to process access paths for a given top-level orc field
     auto process_access_paths = [](const orc::Type* orc_field,
                                    const std::vector<TColumnAccessPath>& 
access_paths,
                                    std::set<uint64_t>& out_ids) {
-        bool access_paths_empty = access_paths.empty();
-
-        std::vector<std::vector<std::string>> paths;
-        bool has_top_level_only = false;
-        for (const auto& access_path : access_paths) {
-            std::vector<std::string> path;
-            if (access_path.type == TAccessPathType::DATA) {
-                path = access_path.data_access_path.path;
-            } else if (access_path.type == TAccessPathType::META) {
-                path = access_path.meta_access_path.path;
-            } else {
-                continue;
-            }
-            std::vector<std::string> remaining_path;
-            if (path.size() > 1) {
-                remaining_path.assign(path.begin() + 1, path.end());
-            } else {
-                // only top-level column name => means whole field
-                remaining_path = std::vector<std::string>();
-            }
-            if (remaining_path.empty()) {
-                has_top_level_only = true;
-            }
-            paths.push_back(std::move(remaining_path));
-        }
-
-        if (has_top_level_only || access_paths_empty) {
-            uint64_t start_id = orc_field->getColumnId();
-            uint64_t max_column_id = orc_field->getMaximumColumnId();
-            for (uint64_t id = start_id; id <= max_column_id; ++id) {
-                out_ids.insert(id);
-            }
-        } else if (!paths.empty()) {
-            out_ids.insert(orc_field->getColumnId());
-            IcebergOrcNestedColumnUtils::extract_nested_column_ids(*orc_field, 
paths, out_ids);
-        }
+        process_nested_access_paths(
+                orc_field, access_paths, out_ids,
+                [](const orc::Type* type) { return type->getColumnId(); },
+                [](const orc::Type* type) { return type->getMaximumColumnId(); 
},
+                IcebergOrcNestedColumnUtils::extract_nested_column_ids);
     };
 
     for (const auto* slot : tuple_descriptor->slots()) {
         auto it = iceberg_id_to_orc_type_map.find(slot->col_unique_id());
         if (it == iceberg_id_to_orc_type_map.end()) {
-            // Column not found in file (e.g., partition column, added column)
+            // Column not found in file
             continue;
         }
         const orc::Type* orc_field = it->second;
 
-        // primitive (non-nested) types: direct mapping by name
+        // primitive (non-nested) types
         if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY 
&&
              slot->col_type() != TYPE_MAP)) {
             column_ids.insert(orc_field->getColumnId());
@@ -745,7 +672,7 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const 
orc::Type* orc_type,
             continue;
         }
 
-        // complex types:
+        // complex types
         const auto& all_access_paths = slot->all_access_paths();
         process_access_paths(orc_field, all_access_paths, column_ids);
 
diff --git a/be/src/vec/exec/format/table/nested_column_access_helper.h 
b/be/src/vec/exec/format/table/nested_column_access_helper.h
new file mode 100644
index 00000000000..316cfb7aebe
--- /dev/null
+++ b/be/src/vec/exec/format/table/nested_column_access_helper.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include "vec/exec/format/table/table_format_reader.h"
+
+namespace doris::vectorized {
+#include "common/compile_check_begin.h"
+
+// Helper that normalizes access paths and delegates nested column id extraction.
+// The caller provides how to access the column id range for the concrete field type
+// (Parquet FieldSchema, ORC Type, etc.) plus a nested extractor implementation.
+template <typename FieldType, typename ColumnIdGetter, typename 
MaxColumnIdGetter,
+          typename ExtractNestedFunc>
+void process_nested_access_paths(const FieldType* field,  // nullptr => the call is a no-op
+                                 const std::vector<TColumnAccessPath>& 
access_paths,  // only DATA and META entries are looked at
+                                 std::set<uint64_t>& out_ids, ColumnIdGetter&& 
column_id_getter,  // out_ids is only inserted into; getter maps field -> its own column id
+                                 MaxColumnIdGetter&& max_column_id_getter,  // field -> inclusive end of its id range
+                                 ExtractNestedFunc&& extract_nested) {  // invoked as (*field, paths, out_ids)
+    if (field == nullptr) {  // nothing to resolve; out_ids is left unchanged
+        return;
+    }
+
+    const bool access_paths_empty = access_paths.empty();
+    std::vector<std::vector<std::string>> paths;  // per access path: elements after the first one
+    paths.reserve(access_paths.size());  // upper bound; entries of other types are skipped
+    bool has_top_level_only = false;  // true once some path has no elements after the first
+
+    for (const auto& access_path : access_paths) {
+        const std::vector<std::string>* path_ptr = nullptr;  // borrowed; the path is not copied
+        if (access_path.type == TAccessPathType::DATA) {
+            path_ptr = &access_path.data_access_path.path;
+        } else if (access_path.type == TAccessPathType::META) {
+            path_ptr = &access_path.meta_access_path.path;
+        } else {
+            continue;  // any other access path type is ignored
+        }
+
+        const auto& path = *path_ptr;
+        std::vector<std::string> remaining_path;
+        if (path.size() > 1) {  // presumably path[0] names the top-level column; keep the rest
+            remaining_path.assign(path.begin() + 1, path.end());
+        }
+        if (remaining_path.empty()) {  // path had at most one element
+            has_top_level_only = true;
+        }
+        paths.push_back(std::move(remaining_path));
+    }
+
+    const uint64_t column_id = column_id_getter(field);
+    if (has_top_level_only || access_paths_empty) {  // select every id in the field's range
+        const uint64_t max_column_id = max_column_id_getter(field);
+        for (uint64_t id = column_id; id <= max_column_id; ++id) {  // both bounds are inclusive
+            out_ids.insert(id);
+        }
+    } else if (!paths.empty()) {  // false only if every entry was skipped above: nothing is added
+        out_ids.insert(column_id);  // the field's own id is added before delegating
+        extract_nested(*field, paths, out_ids);  // extractor gets the nested paths and the output set
+    }
+}
+
+#include "common/compile_check_end.h"
+} // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) 01/01: code refactoring for external table.

Reply via email to