geruh commented on code in PR #614:
URL: https://github.com/apache/iceberg-python/pull/614#discussion_r1583803213


##########
tests/integration/test_inspect_table.py:
##########
@@ -445,3 +445,107 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non
         df = tbl.inspect.partitions(snapshot_id=snapshot.snapshot_id)
         spark_df = spark.sql(f"SELECT * FROM {identifier}.partitions VERSION AS OF {snapshot.snapshot_id}")
         check_pyiceberg_df_equals_spark_df(df, spark_df)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_inspect_files(
+    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
+) -> None:
+    identifier = "default.table_metadata_files"
+
+    tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})
+
+    tbl.overwrite(arrow_table_with_null)
+
+    # append more data
+    tbl.append(arrow_table_with_null)
+
+    df = tbl.refresh().inspect.files()
+
+    assert df.column_names == [
+        'content',
+        'file_path',
+        'file_format',
+        'spec_id',
+        'record_count',
+        'file_size_in_bytes',
+        'column_sizes',
+        'value_counts',
+        'null_value_counts',
+        'nan_value_counts',
+        'lower_bounds',
+        'upper_bounds',
+        'key_metadata',
+        'split_offsets',
+        'equality_ids',
+        'sort_order_id',
+        'readable_metrics',
+    ]
+
+    # make sure the non-nullable fields are filled
+    for int_column in ['content', 'spec_id', 'record_count', 'file_size_in_bytes']:
+        for value in df[int_column]:
+            assert isinstance(value.as_py(), int)
+
+    for split_offsets in df['split_offsets']:
+        assert isinstance(split_offsets.as_py(), list)
+
+    for file_format in df['file_format']:
+        assert file_format.as_py() == "PARQUET"
+
+    for file_path in df['file_path']:
+        assert file_path.as_py().startswith("s3://")
+
+    lhs = df.to_pandas()
+    rhs = spark.table(f"{identifier}.files").toPandas()
+    for column in df.column_names:
+        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
+            if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
+                # NaN != NaN in Python
+                continue
+            if column in [
+                'column_sizes',
+                'value_counts',
+                'null_value_counts',
+                'nan_value_counts',
+                'lower_bounds',
+                'upper_bounds',
+            ]:
+                # Arrow returns a list of tuples, instead of a dict
+                left = dict(left)

Review Comment:
   The left values aren't actually asserted against anything here. Also, these are nested tuples, so you might want to iterate through the map and convert each entry to a dict before comparing.
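   A minimal sketch of the kind of check I mean (hypothetical; assumes lhs/rhs are the pandas frames already built in this test, and that Arrow hands back each map column as a list of (key, value) tuples):

```python
# Sketch only: compare the map-typed metadata columns explicitly,
# converting Arrow's list-of-tuples representation into a dict first.
for column in ['column_sizes', 'value_counts', 'null_value_counts',
               'nan_value_counts', 'lower_bounds', 'upper_bounds']:
    for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
        if left is None and right is None:
            continue
        # left comes back from Arrow as [(key, value), ...];
        # assuming right is already a plain dict on the Spark side
        left_as_dict = {key: value for key, value in left}
        assert left_as_dict == right
```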



##########
tests/integration/test_inspect_table.py:
##########
@@ -445,3 +445,107 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non
         df = tbl.inspect.partitions(snapshot_id=snapshot.snapshot_id)
         spark_df = spark.sql(f"SELECT * FROM {identifier}.partitions VERSION AS OF {snapshot.snapshot_id}")
         check_pyiceberg_df_equals_spark_df(df, spark_df)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_inspect_files(
+    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
+) -> None:
+    identifier = "default.table_metadata_files"
+
+    tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})
+
+    tbl.overwrite(arrow_table_with_null)
+
+    # append more data
+    tbl.append(arrow_table_with_null)
+
+    df = tbl.refresh().inspect.files()
+
+    assert df.column_names == [
+        'content',
+        'file_path',
+        'file_format',
+        'spec_id',
+        'record_count',
+        'file_size_in_bytes',
+        'column_sizes',
+        'value_counts',
+        'null_value_counts',
+        'nan_value_counts',
+        'lower_bounds',
+        'upper_bounds',
+        'key_metadata',
+        'split_offsets',
+        'equality_ids',
+        'sort_order_id',
+        'readable_metrics',
+    ]
+
+    # make sure the non-nullable fields are filled
+    for int_column in ['content', 'spec_id', 'record_count', 'file_size_in_bytes']:
+        for value in df[int_column]:
+            assert isinstance(value.as_py(), int)
+
+    for split_offsets in df['split_offsets']:
+        assert isinstance(split_offsets.as_py(), list)
+
+    for file_format in df['file_format']:
+        assert file_format.as_py() == "PARQUET"
+
+    for file_path in df['file_path']:
+        assert file_path.as_py().startswith("s3://")
+
+    lhs = df.to_pandas()
+    rhs = spark.table(f"{identifier}.files").toPandas()
+    for column in df.column_names:
+        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
+            if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
+                # NaN != NaN in Python
+                continue
+            if column in [
+                'column_sizes',
+                'value_counts',
+                'null_value_counts',
+                'nan_value_counts',
+                'lower_bounds',
+                'upper_bounds',
+            ]:
+                # Arrow returns a list of tuples, instead of a dict
+                left = dict(left)
+            elif column == 'readable_metrics':
+                assert list(left.keys()) == [
+                    'bool',
+                    'string',
+                    'string_long',
+                    'int',
+                    'long',
+                    'float',
+                    'double',
+                    'timestamp',
+                    'timestamptz',
+                    'date',
+                    'binary',
+                    'fixed',
+                ]
+                assert left.keys() == right.asDict().keys()

Review Comment:
   This `asDict` method fails for me.
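   In case it helps, a hedged sketch of a more defensive key comparison (assuming right may come back from toPandas() either as a pyspark.sql.Row or already as a plain dict, depending on the Spark/Arrow setup):

```python
from pyspark.sql import Row

# Sketch only: normalize the Spark-side readable_metrics value before comparing keys,
# since toPandas() may yield either a Row or a plain dict for struct columns.
right_metrics = right.asDict(recursive=True) if isinstance(right, Row) else right
assert list(left.keys()) == list(right_metrics.keys())
```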



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.


