HonahX commented on code in PR #590:
URL: https://github.com/apache/iceberg-python/pull/590#discussion_r1563400732
##########
tests/integration/test_writes/test_writes.py:
##########
@@ -270,6 +270,48 @@ def get_current_snapshot_id(identifier: str) -> int:
     assert tbl.current_snapshot().snapshot_id == get_current_snapshot_id(identifier)  # type: ignore
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_python_writes_special_character_column_with_spark_reads(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = "default.python_writes_special_character_column_with_spark_reads"
+    column_name_with_special_character = "letter/abc"
+    TEST_DATA_WITH_SPECIAL_CHARACTER_COLUMN = {
+        column_name_with_special_character: ['a', None, 'z'],
+        'id': [1, 2, 3],
+        'name': ['AB', 'CD', 'EF'],
+        'address': [
+            {'street': '123', 'city': 'SFO', 'zip': 12345, column_name_with_special_character: 'a'},
+            {'street': '456', 'city': 'SW', 'zip': 67890, column_name_with_special_character: 'b'},
+            {'street': '789', 'city': 'Random', 'zip': 10112, column_name_with_special_character: 'c'},
+        ],
+    }
+    pa_schema = pa.schema([
+        pa.field(column_name_with_special_character, pa.string()),
+        pa.field('id', pa.int32()),
+        pa.field('name', pa.string()),
+        pa.field(
+            'address',
+            pa.struct([
+                pa.field('street', pa.string()),
+                pa.field('city', pa.string()),
+                pa.field('zip', pa.int32()),
+                pa.field(column_name_with_special_character, pa.string()),
+            ]),
+        ),
+    ])
+    arrow_table_with_special_character_column = pa.Table.from_pydict(TEST_DATA_WITH_SPECIAL_CHARACTER_COLUMN, schema=pa_schema)
+    tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema)
+
+    tbl.overwrite(arrow_table_with_special_character_column)
+    # PySpark toPandas() turns nested field into tuple by default, but returns the proper schema when Arrow is enabled
+    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Review Comment:
   Shall we add this to the spark fixture in `conftest.py`? Since the fixture's scope is "session", if we change the config here, all tests that run before this line will not have the configuration, while all tests that run after it will have it enabled. Moving it to the fixture's initialization ensures a consistent set of Spark configs throughout the integration tests. WDYT?
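   As a rough sketch of that suggestion (the real `spark` fixture in `conftest.py` also configures the Iceberg catalog and other options, which are omitted here), the flag could be set once when the session-scoped fixture builds the SparkSession:

   ```python
   # tests/integration/conftest.py (sketch only; catalog configuration omitted)
   import pytest
   from pyspark.sql import SparkSession


   @pytest.fixture(scope="session")
   def spark() -> SparkSession:
       return (
           SparkSession.builder.appName("pyiceberg-integration-tests")
           # Set at session start so every integration test sees the same behaviour:
           # with Arrow enabled, toPandas() preserves nested struct schemas instead
           # of collapsing struct columns into tuples.
           .config("spark.sql.execution.arrow.pyspark.enabled", "true")
           .getOrCreate()
       )
   ```

   That way the setting applies for the whole test session rather than flipping on midway through, depending on test ordering.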