kevinjqliu commented on code in PR #2245:
URL: https://github.com/apache/iceberg-python/pull/2245#discussion_r2249128495
##########
tests/integration/test_writes/test_writes.py:
##########
@@ -1201,6 +1201,49 @@ def test_sanitize_character_partitioned(catalog:
Catalog) -> None:
assert len(tbl.scan().to_arrow()) == 22
[email protected]
[email protected]("catalog", [pytest.lazy_fixture("session_catalog")])
+def test_sanitize_character_partitioned_avro_bug(catalog: Catalog) -> None:
+ table_name = "default.test_table_partitioned_sanitized_character_avro"
+ try:
+ catalog.drop_table(table_name)
+ except NoSuchTableError:
+ pass
+
+ schema = Schema(
+ NestedField(id=1, name="😎", field_type=StringType(), required=False),
+ )
+
+ partition_spec = PartitionSpec(
+ PartitionField(
+ source_id=1,
+ field_id=1001,
+ transform=IdentityTransform(),
+ name="😎",
+ )
+ )
+
+ tbl = _create_table(
+ session_catalog=catalog,
+ identifier=table_name,
+ schema=schema,
+ partition_spec=partition_spec,
+ data=[
+ pa.Table.from_arrays(
+ [pa.array([str(i) for i in range(22)])],
schema=pa.schema([pa.field("😎", pa.string(), nullable=False)])
+ )
+ ],
+ )
+
+ assert len(tbl.scan().to_arrow()) == 22
+
+ con = tbl.scan().to_duckdb("table_test_debug")
+ result = con.query("SELECT * FROM table_test_debug").fetchall()
+ assert len(result) == 22
+
+ assert con.query("SHOW table_test_debug").fetchone() == ("😎", "VARCHAR",
"YES", None, None, None)
Review Comment:
ah i tested the wrong thing in the reference PR. This is still using
pyiceberg table scan to do the reading! We want to use duckdb to read instead.
I had the fix in my local branch. This fails in main branch but is fixed
with this PR
```suggestion
# verify that we can read the table with DuckDB
import duckdb
location = tbl.metadata_location
duckdb.sql("INSTALL iceberg; LOAD iceberg;")
# Configure S3 settings for DuckDB to match the catalog configuration
duckdb.sql("SET s3_endpoint='localhost:9000';")
duckdb.sql("SET s3_access_key_id='admin';")
duckdb.sql("SET s3_secret_access_key='password';")
duckdb.sql("SET s3_use_ssl=false;")
duckdb.sql("SET s3_url_style='path';")
result = duckdb.sql(f"SELECT * FROM
iceberg_scan('{location}')").fetchall()
assert len(result) == 10
```
##########
tests/test_avro_sanitization.py:
##########
Review Comment:
nit: looks like some of these tests are redundant, can we consolidate some
of them?
these 3 are quiet similar, we can just add all the different pairs to the
same test
```
test_avro_field_name_sanitization
test_edge_cases
test_edge_cases_sanitization
```
so are these 3
```
test_avro_compatibility
test_complex_schema_sanitization
test_avro_schema_conversion_sanitization
```
and we might be able to combine these 2
```
test_avro_file_structure_verification
test_emoji_field_name_sanitization
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]