This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new dca16796ad [fix](ParquetReader) definition level of repeated parent is 
wrong (#17337)
dca16796ad is described below

commit dca16796adfac43f5bbcf74df33d212c2bcf8510
Author: Ashin Gau <ashin...@users.noreply.github.com>
AuthorDate: Mon Mar 6 18:15:57 2023 +0800

    [fix](ParquetReader) definition level of repeated parent is wrong (#17337)
    
    Fix three bugs:
    1.  `repeated_parent_def_level ` should be the definition of its repeated 
parent.
    2. Failed to parse schema like `decimal(p, s)`
    3. Fill wrong offsets for array type
---
 be/src/vec/exec/format/parquet/schema_desc.cpp     | 22 +++++++++++-----------
 .../exec/format/parquet/vparquet_column_reader.cpp |  8 ++++----
 .../doris/catalog/HiveMetaStoreClientHelper.java   |  7 ++++++-
 3 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp 
b/be/src/vec/exec/format/parquet/schema_desc.cpp
index c7ff2353d4..ebd9a25352 100644
--- a/be/src/vec/exec/format/parquet/schema_desc.cpp
+++ b/be/src/vec/exec/format/parquet/schema_desc.cpp
@@ -55,11 +55,11 @@ static int num_children_node(const tparquet::SchemaElement& 
schema) {
     return schema.__isset.num_children ? schema.num_children : 0;
 }
 
-static void set_child_node_level(FieldSchema* parent, size_t rep_inc = 0, 
size_t def_inc = 0) {
+static void set_child_node_level(FieldSchema* parent, int16_t 
repeated_parent_def_level) {
     for (auto& child : parent->children) {
-        child.repetition_level = parent->repetition_level + rep_inc;
-        child.definition_level = parent->definition_level + def_inc;
-        child.repeated_parent_def_level = parent->definition_level;
+        child.repetition_level = parent->repetition_level;
+        child.definition_level = parent->definition_level;
+        child.repeated_parent_def_level = repeated_parent_def_level;
     }
 }
 
@@ -129,7 +129,7 @@ Status FieldDescriptor::parse_node_field(const 
std::vector<tparquet::SchemaEleme
         node_field->repetition_level++;
         node_field->definition_level++;
         node_field->children.resize(1);
-        set_child_node_level(node_field);
+        set_child_node_level(node_field, node_field->definition_level);
         auto child = &node_field->children[0];
         parse_physical_field(t_schema, false, child);
 
@@ -315,7 +315,7 @@ Status FieldDescriptor::parse_group_field(const 
std::vector<tparquet::SchemaElem
         group_field->repetition_level++;
         group_field->definition_level++;
         group_field->children.resize(1);
-        set_child_node_level(group_field);
+        set_child_node_level(group_field, group_field->definition_level);
         auto struct_field = &group_field->children[0];
         // the list of struct:
         // repeated group <name> (LIST) {
@@ -379,16 +379,16 @@ Status FieldDescriptor::parse_list_field(const 
std::vector<tparquet::SchemaEleme
             // optional field, and the third level element is the nested 
structure in list
             // produce nested structure like: LIST<INT>, LIST<MAP>, 
LIST<LIST<...>>
             // skip bag/list, it's a repeated element.
-            set_child_node_level(list_field);
+            set_child_node_level(list_field, list_field->definition_level);
             RETURN_IF_ERROR(parse_node_field(t_schemas, curr_pos + 2, 
list_child));
         } else {
             // required field, produce the list of struct
-            set_child_node_level(list_field);
+            set_child_node_level(list_field, list_field->definition_level);
             RETURN_IF_ERROR(parse_struct_field(t_schemas, curr_pos + 1, 
list_child));
         }
     } else if (num_children == 0) {
         // required two level list, for compatibility reason.
-        set_child_node_level(list_field);
+        set_child_node_level(list_field, list_field->definition_level);
         parse_physical_field(second_level, false, list_child);
         _next_schema_pos = curr_pos + 2;
     }
@@ -450,7 +450,7 @@ Status FieldDescriptor::parse_map_field(const 
std::vector<tparquet::SchemaElemen
     map_field->definition_level++;
 
     map_field->children.resize(1);
-    set_child_node_level(map_field);
+    set_child_node_level(map_field, map_field->repeated_parent_def_level);
     auto map_kv_field = &map_field->children[0];
     // produce MAP<STRUCT<KEY, VALUE>>
     RETURN_IF_ERROR(parse_struct_field(t_schemas, curr_pos + 1, map_kv_field));
@@ -473,7 +473,7 @@ Status FieldDescriptor::parse_struct_field(const 
std::vector<tparquet::SchemaEle
     }
     auto num_children = struct_schema.num_children;
     struct_field->children.resize(num_children);
-    set_child_node_level(struct_field);
+    set_child_node_level(struct_field, 
struct_field->repeated_parent_def_level);
     _next_schema_pos = curr_pos + 1;
     for (int i = 0; i < num_children; ++i) {
         RETURN_IF_ERROR(parse_node_field(t_schemas, _next_schema_pos, 
&struct_field->children[i]));
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
index 4e270c30cf..3e94a3082f 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -40,7 +40,7 @@ static void fill_struct_null_map(FieldSchema* field, NullMap& 
null_map,
     DCHECK_EQ(num_levels, rep_levels.size());
     size_t origin_size = null_map.size();
     null_map.resize(origin_size + num_levels);
-    size_t pos = 0;
+    size_t pos = origin_size;
     for (size_t i = 0; i < num_levels; ++i) {
         // skip the levels affect its ancestor or its descendants
         if (def_levels[i] < field->repeated_parent_def_level ||
@@ -53,7 +53,7 @@ static void fill_struct_null_map(FieldSchema* field, NullMap& 
null_map,
             null_map[pos++] = 1;
         }
     }
-    null_map.resize(origin_size + pos);
+    null_map.resize(pos + 1);
 }
 
 static void fill_array_offset(FieldSchema* field, ColumnArray::Offsets64& 
offsets_data,
@@ -88,9 +88,9 @@ static void fill_array_offset(FieldSchema* field, 
ColumnArray::Offsets64& offset
             (*null_map_ptr)[offset_pos] = 1;
         }
     }
-    offsets_data.resize(origin_size + offset_pos + 1);
+    offsets_data.resize(offset_pos + 1);
     if (null_map_ptr != nullptr) {
-        null_map_ptr->resize(origin_size + offset_pos + 1);
+        null_map_ptr->resize(offset_pos + 1);
     }
 }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java
 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java
index 94370f3129..53b492082f 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java
@@ -676,13 +676,18 @@ public class HiveMetaStoreClientHelper {
      */
     private static int findNextNestedField(String commaSplitFields) {
         int numLess = 0;
+        int numBracket = 0;
         for (int i = 0; i < commaSplitFields.length(); i++) {
             char c = commaSplitFields.charAt(i);
             if (c == '<') {
                 numLess++;
             } else if (c == '>') {
                 numLess--;
-            } else if (c == ',' && numLess == 0) {
+            } else if (c == '(') {
+                numBracket++;
+            } else if (c == ')') {
+                numBracket--;
+            } else if (c == ',' && numLess == 0 && numBracket == 0) {
                 return i;
             }
         }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to