geruh commented on code in PR #614:
URL: https://github.com/apache/iceberg-python/pull/614#discussion_r1571245762
##########
tests/integration/test_inspect_table.py:
##########
@@ -445,3 +445,65 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table,
spark_df: DataFrame) -> Non
df = tbl.inspect.partitions(snapshot_id=snapshot.snapshot_id)
spark_df = spark.sql(f"SELECT * FROM {identifier}.partitions VERSION
AS OF {snapshot.snapshot_id}")
check_pyiceberg_df_equals_spark_df(df, spark_df)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_inspect_files(
+ spark: SparkSession, session_catalog: Catalog, arrow_table_with_null:
pa.Table, format_version: int
+) -> None:
+ identifier = "default.table_metadata_files"
+ tbl = _create_table(session_catalog, identifier,
properties={"format-version": format_version})
+
+ tbl.overwrite(arrow_table_with_null)
+
+ # append more data
+ tbl.append(arrow_table_with_null)
+
+ df = tbl.refresh().inspect.files()
+
+ assert df.column_names == [
+ 'content',
+ 'file_path',
+ 'file_format',
+ 'record_count',
+ 'file_size_in_bytes',
+ 'column_sizes',
+ 'value_counts',
+ 'null_value_counts',
+ 'nan_value_counts',
+ 'lower_bounds',
+ 'upper_bounds',
+ 'key_metadata',
+ 'split_offsets',
+ 'equality_ids',
+ ]
+
+ for file_size_in_bytes in df['file_size_in_bytes']:
+ assert isinstance(file_size_in_bytes.as_py(), int)
+
+ for split_offsets in df['split_offsets']:
+ assert isinstance(split_offsets.as_py(), list)
+
+ for file_format in df['file_format']:
+ assert file_format.as_py() == "PARQUET"
+
+ for file_path in df['file_path']:
+ assert file_path.as_py().startswith("s3://")
+
+ lhs = spark.table(f"{identifier}.files").toPandas()
Review Comment:
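The schema of the `lhs` table (read from Spark) differs from the schema asserted above: it additionally contains `spec_id`, `sort_order_id`, and `readable_metrics`. Is this expected?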
```
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   content             2 non-null      int32
 1   file_path           2 non-null      object
 2   file_format         2 non-null      object
 3   spec_id             2 non-null      int32
 4   record_count        2 non-null      int64
 5   file_size_in_bytes  2 non-null      int64
 6   column_sizes        2 non-null      object
 7   value_counts        2 non-null      object
 8   null_value_counts   2 non-null      object
 9   nan_value_counts    2 non-null      object
 10  lower_bounds        2 non-null      object
 11  upper_bounds        2 non-null      object
 12  key_metadata        0 non-null      object
 13  split_offsets       2 non-null      object
 14  equality_ids        0 non-null      object
 15  sort_order_id       0 non-null      float64
 16  readable_metrics    2 non-null      object
dtypes: float64(1), int32(2), int64(2), object(12)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]