wgtmac commented on code in PR #727:
URL: https://github.com/apache/iceberg-cpp/pull/727#discussion_r3411207533


##########
src/iceberg/parquet/parquet_writer.cc:
##########
@@ -74,6 +82,144 @@ Status CheckCompressionAvailable(std::string_view 
compression_name,
   return {};
 }
 
+template <typename ArrowArrayType, typename ValueType>
+Status UpdateFloatingFieldMetrics(int32_t field_id, const ::arrow::Array& 
arrow_array,
+                                  const std::vector<uint8_t>* valid_rows,
+                                  std::unordered_map<int32_t, FieldMetrics>& 
metrics) {
+  constexpr auto expected_type_id =
+      std::is_same_v<ValueType, float> ? ::arrow::Type::FLOAT : 
::arrow::Type::DOUBLE;
+  ICEBERG_PRECHECK(arrow_array.type_id() == expected_type_id,
+                   "Expected Arrow floating-point array for field metrics 
collection");
+  const auto& array = static_cast<const ArrowArrayType&>(arrow_array);
+  auto& field_metrics = metrics[field_id];
+  field_metrics.field_id = field_id;
+  if (field_metrics.value_count < 0) {
+    field_metrics.value_count = 0;
+  }
+  if (field_metrics.null_value_count < 0) {
+    field_metrics.null_value_count = 0;
+  }
+  if (field_metrics.nan_value_count < 0) {
+    field_metrics.nan_value_count = 0;
+  }
+
+  field_metrics.value_count += array.length();
+
+  for (int64_t i = 0; i < array.length(); ++i) {
+    if ((valid_rows != nullptr && (*valid_rows)[i] == 0) || array.IsNull(i)) {
+      ++field_metrics.null_value_count;
+      continue;
+    }
+
+    ValueType value = array.Value(i);
+    if (std::isnan(value)) {
+      ++field_metrics.nan_value_count;
+      continue;
+    }
+
+    auto literal = [&]() {
+      if constexpr (std::is_same_v<ValueType, float>) {
+        return Literal::Float(value);
+      } else {
+        return Literal::Double(value);
+      }
+    }();
+    if (!field_metrics.lower_bound.has_value() ||
+        literal < field_metrics.lower_bound.value()) {
+      field_metrics.lower_bound = literal;
+    }
+    if (!field_metrics.upper_bound.has_value() ||
+        literal > field_metrics.upper_bound.value()) {
+      field_metrics.upper_bound = std::move(literal);
+    }
+  }
+
+  return {};
+}
+
+std::optional<std::vector<uint8_t>> BuildValidRows(const ::arrow::Array& array,

Review Comment:
   Non-blocking: this could be made cheaper by carrying Arrow validity bitmaps 
instead of materializing a `std::vector<uint8_t>` and calling `array.IsNull(i)` 
for every row at each struct level. The main detail is that this needs to 
preserve the current parent-validity behavior: child arrays of a null 
`StructArray` do not necessarily have those parent nulls reflected in their own 
null bitmap, so the effective validity should be `parent_validity AND 
array.null_bitmap()`.
   
   One possible approach is to keep a bitmap/buffer plus offset for the current 
effective validity, and when descending into a field, use Arrow bitmap 
utilities to AND it with the child array validity bitmap (or just reuse the 
parent bitmap when the child has `null_count() == 0`). Then the float/double 
collector can iterate only the effective bitmap bits instead of indexing a byte 
vector. This should avoid per-level byte-vector allocation and reduce the 
per-row validity checks for wide/deep struct writes.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to