Re: [PR] Sanitized special character column name before writing to parquet [iceberg-python]

via GitHub Fri, 12 Apr 2024 18:42:29 -0700


kevinjqliu commented on code in PR #590:
URL: https://github.com/apache/iceberg-python/pull/590#discussion_r1563557100



##########
tests/integration/test_writes/test_writes.py:
##########
@@ -270,6 +270,48 @@ def get_current_snapshot_id(identifier: str) -> int:
     assert tbl.current_snapshot().snapshot_id == get_current_snapshot_id(identifier)  # type: ignore
 
 
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_python_writes_special_character_column_with_spark_reads(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = "default.python_writes_special_character_column_with_spark_reads"
+    column_name_with_special_character = "letter/abc"
+    TEST_DATA_WITH_SPECIAL_CHARACTER_COLUMN = {
+        column_name_with_special_character: ['a', None, 'z'],
+        'id': [1, 2, 3],
+        'name': ['AB', 'CD', 'EF'],
+        'address': [
+            {'street': '123', 'city': 'SFO', 'zip': 12345, column_name_with_special_character: 'a'},
+            {'street': '456', 'city': 'SW', 'zip': 67890, column_name_with_special_character: 'b'},
+            {'street': '789', 'city': 'Random', 'zip': 10112, column_name_with_special_character: 'c'},
+        ],
+    }
+    pa_schema = pa.schema([
+        pa.field(column_name_with_special_character, pa.string()),
+        pa.field('id', pa.int32()),
+        pa.field('name', pa.string()),
+        pa.field(
+            'address',
+            pa.struct([
+                pa.field('street', pa.string()),
+                pa.field('city', pa.string()),
+                pa.field('zip', pa.int32()),
+                pa.field(column_name_with_special_character, pa.string()),
+            ]),
+        ),
+    ])
+    arrow_table_with_special_character_column = pa.Table.from_pydict(TEST_DATA_WITH_SPECIAL_CHARACTER_COLUMN, schema=pa_schema)
+    tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema)
+
+    tbl.overwrite(arrow_table_with_special_character_column)
+    # PySpark toPandas() turns nested field into tuple by default, but returns the proper schema when Arrow is enabled
+    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Review Comment:
   Good catch! I didn't know about the fixture scope behavior. Moved to conftest.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Re: [PR] Sanitized special character column name before writing to parquet [iceberg-python]

Reply via email to