liurenjie1024 commented on code in PR #1563:
URL: https://github.com/apache/iceberg-rust/pull/1563#discussion_r2247121459


##########
crates/iceberg/src/spec/manifest/mod.rs:
##########
@@ -1056,4 +1093,155 @@ mod tests {
         assert!(!partitions[2].clone().contains_null);
         assert_eq!(partitions[2].clone().contains_nan, Some(false));
     }
+
+    #[test]
+    fn test_data_file_serialization() {
+        // Create a simple schema
+        let schema = Schema::builder()
+            .with_schema_id(1)
+            .with_identifier_field_ids(vec![1])
+            .with_fields(vec![
+                NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long)).into(),
+                NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(),
+            ])
+            .build()
+            .unwrap();
+
+        // Create a partition spec
+        let partition_spec = PartitionSpec::builder(schema.clone())
+            .with_spec_id(1)
+            .add_partition_field("id", "id_partition", Transform::Identity)
+            .unwrap()
+            .build()
+            .unwrap();
+
+        // Get partition type from the partition spec
+        let partition_type = partition_spec.partition_type(&schema).unwrap();
+
+        // Create a vector of DataFile objects
+        let data_files = vec![
+            DataFileBuilder::default()
+                .content(DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file1.parquet".to_string())
+                .file_size_in_bytes(1024)
+                .record_count(100)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 512)]))
+                .value_counts(HashMap::from([(1, 100)]))
+                .null_value_counts(HashMap::from([(1, 0)]))
+                .build()
+                .unwrap(),
+            DataFileBuilder::default()
+                .content(DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file2.parquet".to_string())
+                .file_size_in_bytes(2048)
+                .record_count(200)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 1024)]))
+                .value_counts(HashMap::from([(1, 200)]))
+                .null_value_counts(HashMap::from([(1, 10)]))
+                .build()
+                .unwrap(),
+        ];
+
+        // Serialize the DataFile objects
+        let serialized_files = data_files
+            .into_iter()
+            .map(|f| serialize_data_file_to_json(f, &partition_type, FormatVersion::V2).unwrap())
+            .collect::<Vec<String>>();
+
+        // Verify we have the expected serialized files
+        assert_eq!(serialized_files.len(), 2);
+        let pretty_json1: Value = serde_json::from_str(&serialized_files.get(0).unwrap()).unwrap();
+        let pretty_json2: Value = serde_json::from_str(&serialized_files.get(1).unwrap()).unwrap();
+        let expected_serialized_file1 = serde_json::json!({
+            "content": 0,
+            "file_path": "path/to/file1.parquet",
+            "file_format": "PARQUET",
+            "partition": {},
+            "record_count": 100,
+            "file_size_in_bytes": 1024,
+            "column_sizes": [
+                { "key": 1, "value": 512 }
+            ],
+            "value_counts": [
+                { "key": 1, "value": 100 }
+            ],
+            "null_value_counts": [
+                { "key": 1, "value": 0 }
+            ],
+            "nan_value_counts": [],
+            "lower_bounds": [],
+            "upper_bounds": [],
+            "key_metadata": null,
+            "split_offsets": [],
+            "equality_ids": [],
+            "sort_order_id": null,
+            "first_row_id": null,
+            "referenced_data_file": null,
+            "content_offset": null,
+            "content_size_in_bytes": null
+        });
+        let expected_serialized_file2 = serde_json::json!({
+            "content": 0,
+            "file_path": "path/to/file2.parquet",
+            "file_format": "PARQUET",
+            "partition": {},
+            "record_count": 200,
+            "file_size_in_bytes": 2048,
+            "column_sizes": [
+                { "key": 1, "value": 1024 }
+            ],
+            "value_counts": [
+                { "key": 1, "value": 200 }
+            ],
+            "null_value_counts": [
+                { "key": 1, "value": 10 }
+            ],
+            "nan_value_counts": [],
+            "lower_bounds": [],
+            "upper_bounds": [],
+            "key_metadata": null,
+            "split_offsets": [],
+            "equality_ids": [],
+            "sort_order_id": null,
+            "first_row_id": null,
+            "referenced_data_file": null,
+            "content_offset": null,
+            "content_size_in_bytes": null
+        });

Review Comment:
   > Also I found a tricky corner case: DataFile's serializer won't preserve the order of DataFile's array fields like column_sizes. Right now I just make the array sizes one to work around that; maybe there is a better solution?
   
   I guess the randomness comes from `HashMap`, but since you now use `serde_json::Value` to do the check, I guess the order doesn't matter?
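   
   For illustration (a standalone sketch, not code from this PR): `serde_json::Value` equality ignores object key order, but arrays are still compared element by element, so `HashMap` iteration order can only leak into the comparison through array-encoded fields such as `column_sizes`:
   ```rust
   use serde_json::json;

   fn main() {
       // Object key order does not affect `Value` equality.
       let a = json!({"record_count": 100, "value_counts": [{"key": 1, "value": 100}]});
       let b = json!({"value_counts": [{"key": 1, "value": 100}], "record_count": 100});
       assert_eq!(a, b);

       // Array element order is part of the value, so a map with more than one
       // entry can serialize to unequal arrays depending on iteration order.
       let c = json!([{"key": 1, "value": 512}, {"key": 2, "value": 64}]);
       let d = json!([{"key": 2, "value": 64}, {"key": 1, "value": 512}]);
       assert_ne!(c, d);
   }
   ```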



##########
crates/iceberg/src/spec/manifest/mod.rs:
##########
@@ -1056,4 +1093,155 @@ mod tests {
         assert!(!partitions[2].clone().contains_null);
         assert_eq!(partitions[2].clone().contains_nan, Some(false));
     }
+
+    #[test]
+    fn test_data_file_serialization() {
+        // Create a simple schema
+        let schema = Schema::builder()
+            .with_schema_id(1)
+            .with_identifier_field_ids(vec![1])
+            .with_fields(vec![
+                NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long)).into(),
+                NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(),
+            ])
+            .build()
+            .unwrap();
+
+        // Create a partition spec
+        let partition_spec = PartitionSpec::builder(schema.clone())
+            .with_spec_id(1)
+            .add_partition_field("id", "id_partition", Transform::Identity)
+            .unwrap()
+            .build()
+            .unwrap();
+
+        // Get partition type from the partition spec
+        let partition_type = partition_spec.partition_type(&schema).unwrap();
+
+        // Create a vector of DataFile objects
+        let data_files = vec![
+            DataFileBuilder::default()
+                .content(DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file1.parquet".to_string())
+                .file_size_in_bytes(1024)
+                .record_count(100)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 512)]))
+                .value_counts(HashMap::from([(1, 100)]))
+                .null_value_counts(HashMap::from([(1, 0)]))
+                .build()
+                .unwrap(),
+            DataFileBuilder::default()
+                .content(DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file2.parquet".to_string())
+                .file_size_in_bytes(2048)
+                .record_count(200)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 1024)]))
+                .value_counts(HashMap::from([(1, 200)]))
+                .null_value_counts(HashMap::from([(1, 10)]))
+                .build()
+                .unwrap(),
+        ];
+
+        // Serialize the DataFile objects
+        let serialized_files = data_files
+            .into_iter()
+            .map(|f| serialize_data_file_to_json(f, &partition_type, FormatVersion::V2).unwrap())
+            .collect::<Vec<String>>();
+
+        // Verify we have the expected serialized files
+        assert_eq!(serialized_files.len(), 2);
+        let pretty_json1: Value = serde_json::from_str(&serialized_files.get(0).unwrap()).unwrap();
+        let pretty_json2: Value = serde_json::from_str(&serialized_files.get(1).unwrap()).unwrap();
+        let expected_serialized_file1 = serde_json::json!({
+            "content": 0,
+            "file_path": "path/to/file1.parquet",
+            "file_format": "PARQUET",
+            "partition": {},
+            "record_count": 100,
+            "file_size_in_bytes": 1024,
+            "column_sizes": [
+                { "key": 1, "value": 512 }
+            ],
+            "value_counts": [
+                { "key": 1, "value": 100 }
+            ],
+            "null_value_counts": [
+                { "key": 1, "value": 0 }
+            ],
+            "nan_value_counts": [],
+            "lower_bounds": [],
+            "upper_bounds": [],
+            "key_metadata": null,
+            "split_offsets": [],
+            "equality_ids": [],
+            "sort_order_id": null,
+            "first_row_id": null,
+            "referenced_data_file": null,
+            "content_offset": null,
+            "content_size_in_bytes": null
+        });
+        let expected_serialized_file2 = serde_json::json!({
+            "content": 0,
+            "file_path": "path/to/file2.parquet",
+            "file_format": "PARQUET",
+            "partition": {},
+            "record_count": 200,
+            "file_size_in_bytes": 2048,
+            "column_sizes": [
+                { "key": 1, "value": 1024 }
+            ],
+            "value_counts": [
+                { "key": 1, "value": 200 }
+            ],
+            "null_value_counts": [
+                { "key": 1, "value": 10 }
+            ],
+            "nan_value_counts": [],
+            "lower_bounds": [],
+            "upper_bounds": [],
+            "key_metadata": null,
+            "split_offsets": [],
+            "equality_ids": [],
+            "sort_order_id": null,
+            "first_row_id": null,
+            "referenced_data_file": null,
+            "content_offset": null,
+            "content_size_in_bytes": null
+        });

Review Comment:
   Seems it still matters in the output below. One approach is to change [this function](https://github.com/apache/iceberg-rust/blob/8dd4a2216728c6fcad42a4d48ea7e39bef30517d/crates/iceberg/src/spec/manifest/_serde.rs#L295) to make it deterministic:
   ```rust
   #[cfg(test)]
   // sort the keys.
   ```
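   
   For what it's worth, one possible shape for that change (a hedged sketch; the actual types and field handling in `_serde.rs` may differ, and whether it sits behind `#[cfg(test)]` or applies unconditionally is a separate choice) is to route the `HashMap`-backed maps through a `BTreeMap`, so the serialized key/value entries always come out sorted by field id:
   ```rust
   use std::collections::{BTreeMap, HashMap};

   // Hypothetical helper mirroring the idea above: turn an unordered
   // `HashMap<i32, u64>` into a `[(key, value), ...]` list with a
   // deterministic, sorted-by-key order before serialization.
   fn to_sorted_entries(map: &HashMap<i32, u64>) -> Vec<(i32, u64)> {
       map.iter()
           .map(|(k, v)| (*k, *v))
           .collect::<BTreeMap<_, _>>()
           .into_iter()
           .collect()
   }

   fn main() {
       let column_sizes = HashMap::from([(2, 64), (1, 512)]);
       // Entries come out sorted by key, regardless of HashMap iteration order.
       assert_eq!(to_sorted_entries(&column_sizes), vec![(1, 512), (2, 64)]);
   }
   ```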
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

