geruh commented on code in PR #614: URL: https://github.com/apache/iceberg-python/pull/614#discussion_r1571245762
########## tests/integration/test_inspect_table.py: ########## @@ -445,3 +445,65 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non df = tbl.inspect.partitions(snapshot_id=snapshot.snapshot_id) spark_df = spark.sql(f"SELECT * FROM {identifier}.partitions VERSION AS OF {snapshot.snapshot_id}") check_pyiceberg_df_equals_spark_df(df, spark_df) + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_inspect_files( + spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int +) -> None: + identifier = "default.table_metadata_files" + tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version}) + + tbl.overwrite(arrow_table_with_null) + + # append more data + tbl.append(arrow_table_with_null) + + df = tbl.refresh().inspect.files() + + assert df.column_names == [ + 'content', + 'file_path', + 'file_format', + 'record_count', + 'file_size_in_bytes', + 'column_sizes', + 'value_counts', + 'null_value_counts', + 'nan_value_counts', + 'lower_bounds', + 'upper_bounds', + 'key_metadata', + 'split_offsets', + 'equality_ids', + ] + + for file_size_in_bytes in df['file_size_in_bytes']: + assert isinstance(file_size_in_bytes.as_py(), int) + + for split_offsets in df['split_offsets']: + assert isinstance(split_offsets.as_py(), list) + + for file_format in df['file_format']: + assert file_format.as_py() == "PARQUET" + + for file_path in df['file_path']: + assert file_path.as_py().startswith("s3://") + + lhs = spark.table(f"{identifier}.files").toPandas() Review Comment: The lhs spark table schema is different from the inspect files schema, is this expected? 
```
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   content             2 non-null      int32
 1   file_path           2 non-null      object
 2   file_format         2 non-null      object
 3   spec_id             2 non-null      int32
 4   record_count        2 non-null      int64
 5   file_size_in_bytes  2 non-null      int64
 6   column_sizes        2 non-null      object
 7   value_counts        2 non-null      object
 8   null_value_counts   2 non-null      object
 9   nan_value_counts    2 non-null      object
 10  lower_bounds        2 non-null      object
 11  upper_bounds        2 non-null      object
 12  key_metadata        0 non-null      object
 13  split_offsets       2 non-null      object
 14  equality_ids        0 non-null      object
 15  sort_order_id       0 non-null      float64
 16  readable_metrics    2 non-null      object
dtypes: float64(1), int32(2), int64(2), object(12)
```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org