Fokko commented on code in PR #7831:
URL: https://github.com/apache/iceberg/pull/7831#discussion_r1289075132


##########
python/tests/io/test_pyarrow.py:
##########
@@ -1345,3 +1374,655 @@ def test_pyarrow_wrap_fsspec(example_task: FileScanTask, table_schema_simple: Sc
 bar: [[1,2,3]]
 baz: [[true,false,null]]"""
     )
+
+
+def construct_test_table() -> Tuple[Any, Any, Union[TableMetadataV1, TableMetadataV2]]:
+    table_metadata = {
+        "format-version": 2,
+        "location": "s3://bucket/test/location",
+        "last-column-id": 7,
+        "current-schema-id": 0,
+        "schemas": [
+            {
+                "type": "struct",
+                "schema-id": 0,
+                "fields": [
+                    {"id": 1, "name": "strings", "required": False, "type": "string"},
+                    {"id": 2, "name": "floats", "required": False, "type": "float"},
+                    {
+                        "id": 3,
+                        "name": "list",
+                        "required": False,
+                        "type": {"type": "list", "element-id": 5, "element": "long", "element-required": False},
+                    },
+                    {
+                        "id": 4,
+                        "name": "maps",
+                        "required": False,
+                        "type": {
+                            "type": "map",
+                            "key-id": 6,
+                            "key": "long",
+                            "value-id": 7,
+                            "value": "long",
+                            "value-required": False,
+                        },
+                    },
+                ],
+            },
+        ],
+        "default-spec-id": 0,
+        "partition-specs": [{"spec-id": 0, "fields": []}],
+        "properties": {},
+    }
+
+    table_metadata = TableMetadataUtil.parse_obj(table_metadata)
+    arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
+
+    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
+
+    _floats = [3.14, math.nan, 1.69, 100]
+
+    _list = [[1, 2, 3], [4, 5, 6], None, [7, 8, 9]]
+
+    _maps: List[Optional[Dict[int, int]]] = [
+        {1: 2, 3: 4},
+        None,
+        {5: 6},
+        {},
+    ]
+
+    table = pa.Table.from_pydict(
+        {
+            "strings": _strings,
+            "floats": _floats,
+            "list": _list,
+            "maps": _maps,
+        },
+        schema=arrow_schema,
+    )
+    metadata_collector: List[Any] = []
+
+    with pa.BufferOutputStream() as f:
+        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+            writer.write_table(table)
+
+        return f.getvalue(), metadata_collector[0], table_metadata
+
+
+def test_record_count() -> None:
+    (file_bytes, metadata, table_metadata) = construct_test_table()
+
+    datafile = DataFile()
+    fill_parquet_file_metadata(datafile, metadata, len(file_bytes), table_metadata)
+
+    assert datafile.record_count == 4
+
+
+def test_file_size() -> None:
+    (file_bytes, metadata, table_metadata) = construct_test_table()
+
+    datafile = DataFile()
+    fill_parquet_file_metadata(datafile, metadata, len(file_bytes), table_metadata)
+
+    assert datafile.file_size_in_bytes == len(file_bytes)
+
+
+def test_value_counts() -> None:
+    (file_bytes, metadata, table_metadata) = construct_test_table()
+
+    datafile = DataFile()
+    fill_parquet_file_metadata(datafile, metadata, len(file_bytes), table_metadata)
+
+    assert len(datafile.value_counts) == 5
+    assert datafile.value_counts[1] == 4
+    assert datafile.value_counts[2] == 4
+    assert datafile.value_counts[5] == 10  # 3 lists with 3 items and a None value
+    assert datafile.value_counts[6] == 5

Review Comment:
   ```sql
   CREATE TABLE default.arrays
   SELECT
       array(1, 2, 3, null) as with_a_null,
       array(1, 2, cast('NaN' as double), null) as with_null_and_nan,
       array(1, 2, 3, cast('NaN' as double)) as with_a_nan
   ```
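   
   For a pure-Python reproduction of the same rows (a sketch only, not part of this PR; the column names and Arrow types simply mirror the SQL above, following the `construct_test_table` pattern from the diff):
   
   ```python
   import math
   
   import pyarrow as pa
   import pyarrow.parquet as pq
   
   # Mirror the three array columns from the Spark query above.
   arrow_schema = pa.schema(
       [
           pa.field("with_a_null", pa.list_(pa.int32())),
           pa.field("with_null_and_nan", pa.list_(pa.float64())),
           pa.field("with_a_nan", pa.list_(pa.float64())),
       ]
   )
   
   table = pa.Table.from_pydict(
       {
           "with_a_null": [[1, 2, 3, None]],
           "with_null_and_nan": [[1.0, 2.0, math.nan, None]],
           "with_a_nan": [[1.0, 2.0, 3.0, math.nan]],
       },
       schema=arrow_schema,
   )
   
   # Collect the Parquet FileMetaData while writing, the same way
   # construct_test_table does, so the null/NaN counts can be inspected.
   metadata_collector = []
   with pa.BufferOutputStream() as sink:
       with pq.ParquetWriter(sink, table.schema, metadata_collector=metadata_collector) as writer:
           writer.write_table(table)
   ```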
   
   Schema:
   ```json
   {
        "type": "struct",
        "schema-id": 0,
        "fields": [{
                "id": 1,
                "name": "with_a_null",
                "required": false,
                "type": {
                        "type": "list",
                        "element-id": 4,
                        "element": "int",
                        "element-required": false
                }
        }, {
                "id": 2,
                "name": "with_null_and_nan",
                "required": false,
                "type": {
                        "type": "list",
                        "element-id": 5,
                        "element": "double",
                        "element-required": false
                }
        }, {
                "id": 3,
                "name": "with_a_nan",
                "required": false,
                "type": {
                        "type": "list",
                        "element-id": 6,
                        "element": "double",
                        "element-required": false
                }
        }]
   }
   ```
   
   The missing upper and lower bounds seem to be an issue with Spark; it does generate them for the double columns, as the manifest entry below shows:
   
   ```json
   {
        "status": 1,
        "snapshot_id": {
                "long": 3911973389432551915
        },
        "data_file": {
                "file_path": 
"s3://warehouse/default/arrays/data/00000-0-1e0d4a20-4473-450e-9b1e-9196b5edac66-00001.parquet",
                "file_format": "PARQUET",
                "partition": {},
                "record_count": 1,
                "file_size_in_bytes": 1310,
                "block_size_in_bytes": 67108864,
                "column_sizes": {
                        "array": [{
                                "key": 4,
                                "value": 60
                        }, {
                                "key": 5,
                                "value": 63
                        }, {
                                "key": 6,
                                "value": 66
                        }]
                },
                "value_counts": {
                        "array": [{
                                "key": 4,
                                "value": 4
                        }, {
                                "key": 5,
                                "value": 4
                        }, {
                                "key": 6,
                                "value": 4
                        }]
                },
                "null_value_counts": {
                        "array": [{
                                "key": 4,
                                "value": 1
                        }, {
                                "key": 5,
                                "value": 1
                        }, {
                                "key": 6,
                                "value": 0
                        }]
                },
                "nan_value_counts": {
                        "array": [{
                                "key": 5,
                                "value": 1
                        }, {
                                "key": 6,
                                "value": 1
                        }]
                },
                "lower_bounds": {
                        "array": [{
                                "key": 5,
                                "value": 
"\u0000\u0000\u0000\u0000\u0000\u0000ð?"
                        }, {
                                "key": 6,
                                "value": 
"\u0000\u0000\u0000\u0000\u0000\u0000ð?"
                        }]
                },
                "upper_bounds": {
                        "array": [{
                                "key": 5,
                                "value": 
"\u0000\u0000\u0000\u0000\u0000\u0000\u0000@"
                        }, {
                                "key": 6,
                                "value": 
"\u0000\u0000\u0000\u0000\u0000\u0000\b@"
                        }]
                },
                "key_metadata": null,
                "split_offsets": {
                        "array": [4]
                },
                "sort_order_id": {
                        "int": 0
                }
        }
   }
   ```
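   
   The bound values above look like the 8-byte little-endian IEEE 754 doubles from Iceberg's binary single-value serialization, just rendered as text in the Avro dump. A quick sanity check (a sketch, with the bytes transcribed from the manifest entry above):
   
   ```python
   import struct
   
   # Lower bound of column 5 ("\u0000...ð?"): 0x3FF0000000000000 -> 1.0
   print(struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xf0\x3f")[0])  # 1.0
   
   # Upper bound of column 5: NaN and null elements are excluded from the bounds
   print(struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\x00\x40")[0])  # 2.0
   
   # Upper bound of column 6 ("\b" is 0x08): the NaN element is excluded
   print(struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\x08\x40")[0])  # 3.0
   ```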




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

