Re: [PR] Add Files metadata table [iceberg-python]

via GitHub Thu, 18 Apr 2024 12:16:31 -0700


geruh commented on code in PR #614:
URL: https://github.com/apache/iceberg-python/pull/614#discussion_r1571245762



##########
tests/integration/test_inspect_table.py:
##########
@@ -445,3 +445,65 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, 
spark_df: DataFrame) -> Non
         df = tbl.inspect.partitions(snapshot_id=snapshot.snapshot_id)
         spark_df = spark.sql(f"SELECT * FROM {identifier}.partitions VERSION 
AS OF {snapshot.snapshot_id}")
         check_pyiceberg_df_equals_spark_df(df, spark_df)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_inspect_files(
+    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: 
pa.Table, format_version: int
+) -> None:
+    identifier = "default.table_metadata_files"
+    tbl = _create_table(session_catalog, identifier, 
properties={"format-version": format_version})
+
+    tbl.overwrite(arrow_table_with_null)
+
+    # append more data
+    tbl.append(arrow_table_with_null)
+
+    df = tbl.refresh().inspect.files()
+
+    assert df.column_names == [
+        'content',
+        'file_path',
+        'file_format',
+        'record_count',
+        'file_size_in_bytes',
+        'column_sizes',
+        'value_counts',
+        'null_value_counts',
+        'nan_value_counts',
+        'lower_bounds',
+        'upper_bounds',
+        'key_metadata',
+        'split_offsets',
+        'equality_ids',
+    ]
+
+    for file_size_in_bytes in df['file_size_in_bytes']:
+        assert isinstance(file_size_in_bytes.as_py(), int)
+
+    for split_offsets in df['split_offsets']:
+        assert isinstance(split_offsets.as_py(), list)
+
+    for file_format in df['file_format']:
+        assert file_format.as_py() == "PARQUET"
+
+    for file_path in df['file_path']:
+        assert file_path.as_py().startswith("s3://")
+
+    lhs = spark.table(f"{identifier}.files").toPandas()

Review Comment:
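   The `lhs` Spark table schema is different from the `inspect.files()` schema: 
   Spark returns 17 columns, including `spec_id`, `sort_order_id`, and 
   `readable_metrics`, which are not in the column list asserted above. Is this 
   expected?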
   
   ```
   Data columns (total 17 columns):
    #   Column              Non-Null Count  Dtype  
   ---  ------              --------------  -----  
    0   content             2 non-null      int32  
    1   file_path           2 non-null      object 
    2   file_format         2 non-null      object 
    3   spec_id             2 non-null      int32  
    4   record_count        2 non-null      int64  
    5   file_size_in_bytes  2 non-null      int64  
    6   column_sizes        2 non-null      object 
    7   value_counts        2 non-null      object 
    8   null_value_counts   2 non-null      object 
    9   nan_value_counts    2 non-null      object 
    10  lower_bounds        2 non-null      object 
    11  upper_bounds        2 non-null      object 
    12  key_metadata        0 non-null      object 
    13  split_offsets       2 non-null      object 
    14  equality_ids        0 non-null      object 
    15  sort_order_id       0 non-null      float64
    16  readable_metrics    2 non-null      object
   dtypes: float64(1), int32(2), int64(2), object(12)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Re: [PR] Add Files metadata table [iceberg-python]

Reply via email to