wgtmac commented on code in PR #727:
URL: https://github.com/apache/iceberg-cpp/pull/727#discussion_r3411207533
##########
src/iceberg/parquet/parquet_writer.cc:
##########
@@ -74,6 +82,144 @@ Status CheckCompressionAvailable(std::string_view
compression_name,
return {};
}
+template <typename ArrowArrayType, typename ValueType>
+Status UpdateFloatingFieldMetrics(int32_t field_id, const ::arrow::Array&
arrow_array,
+ const std::vector<uint8_t>* valid_rows,
+ std::unordered_map<int32_t, FieldMetrics>&
metrics) {
+ constexpr auto expected_type_id =
+ std::is_same_v<ValueType, float> ? ::arrow::Type::FLOAT :
::arrow::Type::DOUBLE;
+ ICEBERG_PRECHECK(arrow_array.type_id() == expected_type_id,
+ "Expected Arrow floating-point array for field metrics
collection");
+ const auto& array = static_cast<const ArrowArrayType&>(arrow_array);
+ auto& field_metrics = metrics[field_id];
+ field_metrics.field_id = field_id;
+ if (field_metrics.value_count < 0) {
+ field_metrics.value_count = 0;
+ }
+ if (field_metrics.null_value_count < 0) {
+ field_metrics.null_value_count = 0;
+ }
+ if (field_metrics.nan_value_count < 0) {
+ field_metrics.nan_value_count = 0;
+ }
+
+ field_metrics.value_count += array.length();
+
+ for (int64_t i = 0; i < array.length(); ++i) {
+ if ((valid_rows != nullptr && (*valid_rows)[i] == 0) || array.IsNull(i)) {
+ ++field_metrics.null_value_count;
+ continue;
+ }
+
+ ValueType value = array.Value(i);
+ if (std::isnan(value)) {
+ ++field_metrics.nan_value_count;
+ continue;
+ }
+
+ auto literal = [&]() {
+ if constexpr (std::is_same_v<ValueType, float>) {
+ return Literal::Float(value);
+ } else {
+ return Literal::Double(value);
+ }
+ }();
+ if (!field_metrics.lower_bound.has_value() ||
+ literal < field_metrics.lower_bound.value()) {
+ field_metrics.lower_bound = literal;
+ }
+ if (!field_metrics.upper_bound.has_value() ||
+ literal > field_metrics.upper_bound.value()) {
+ field_metrics.upper_bound = std::move(literal);
+ }
+ }
+
+ return {};
+}
+
+std::optional<std::vector<uint8_t>> BuildValidRows(const ::arrow::Array& array,
Review Comment:
Non-blocking: this could be made cheaper by carrying Arrow validity bitmaps
instead of materializing a `std::vector<uint8_t>` and calling `array.IsNull(i)`
for every row at each struct level. The main detail is that this needs to
preserve the current parent-validity behavior: child arrays of a null
`StructArray` do not necessarily have those parent nulls reflected in their own
null bitmap, so the effective validity should be `parent_validity AND
array.null_bitmap()`.
One possible approach is to keep a bitmap/buffer plus offset for the current
effective validity, and when descending into a field, use Arrow bitmap
utilities to AND it with the child array validity bitmap (or just reuse the
parent bitmap when the child has `null_count() == 0`). Then the float/double
collector can iterate only the effective bitmap bits instead of indexing a byte
vector. This should avoid per-level byte-vector allocation and reduce the
per-row validity checks for wide/deep struct writes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]