kevinjqliu opened a new pull request, #1483: URL: https://github.com/apache/iceberg-python/pull/1483
Added a reproducible test for #1194. Run `poetry run pytest tests/table/test_v1_table.py`.

Looking at the stack trace, one of the issues is `read_manifest_list` defaulting to the V2 manifest-list schema:
* https://github.com/apache/iceberg-python/blob/acd6f5a8a19db709e835e2686b87d4db3dca254f/pyiceberg/manifest.py#L651-L653
* https://github.com/apache/iceberg-python/blob/acd6f5a8a19db709e835e2686b87d4db3dca254f/pyiceberg/manifest.py#L62

Stack trace:
```
(3.9.21) ➜  iceberg-python git:(kevinjqliu/biglake-v1-table) poetry run pytest tests/table/test_v1_table.py
=============================== test session starts ===============================
platform darwin -- Python 3.12.8, pytest-7.4.4, pluggy-1.5.0
rootdir: /Users/kevinliu/repos/iceberg-python
configfile: pyproject.toml
plugins: checkdocs-2.10.1, anyio-4.7.0, mock-3.14.0, lazy-fixture-0.6.3, requests-mock-1.12.1
collected 1 item

tests/table/test_v1_table.py F                                              [100%]

==================================== FAILURES =====================================
_____________________________ test_read_biglake_table _____________________________

    def test_read_biglake_table() -> None:
        # There's a downloaded version of BigLake Iceberg table in ../../kevinliu_blmt/
        current_dir = os.path.dirname(os.path.abspath(__file__))
        warehouse_location = os.path.abspath(os.path.join(current_dir, "../../kevinliu_blmt"))
        metadata_json_file_path = warehouse_location + "/metadata/v1727125782.metadata.json"
        tbl = StaticTable.from_metadata(metadata_json_file_path)
        assert tbl
        # This is a V1 table
        assert tbl.format_version == 1
        # And metadata currently refers to the original GCS bucket (gs://kevinliu_blmt)
        assert "gs://" in tbl.metadata.location
        assert "gs://" in tbl.metadata.current_snapshot().manifest_list

        # Let's redirect GCS bucket to local file path by overriding PyArrowFileIO's parse_location function
        GCS_BUCKET = "gs://kevinliu_blmt"
        LOCAL_WAREHOUSE = f"file://{warehouse_location}"

        def gcs_location_override(f):
            @wraps(f)
            def wrapper(location: str):
                if location.startswith(GCS_BUCKET):
                    location = location.replace(GCS_BUCKET, LOCAL_WAREHOUSE)
                    print(f"Redirected location: {location}")
                return f(location)

            return wrapper

        PyArrowFileIO.parse_location = staticmethod(gcs_location_override(PyArrowFileIO.parse_location))

        # Now we can try to read the table
>       tbl.scan().to_pandas()

tests/table/test_v1_table.py:56:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pyiceberg/table/__init__.py:1572: in to_pandas
    return self.to_arrow().to_pandas(**kwargs)
pyiceberg/table/__init__.py:1539: in to_arrow
    ).to_table(self.plan_files())
pyiceberg/table/__init__.py:1470: in plan_files
    for manifest_file in snapshot.manifests(self.io)
pyiceberg/table/snapshots.py:256: in manifests
    return list(_manifests(io, self.manifest_list))
../../Library/Caches/pypoetry/virtualenvs/pyiceberg-Is5Rt7Ah-py3.12/lib/python3.12/site-packages/cachetools/__init__.py:752: in wrapper
    v = func(*args, **kwargs)
pyiceberg/manifest.py:638: in _manifests
    return tuple(read_manifest_list(file))
pyiceberg/manifest.py:651: in read_manifest_list
    with AvroFile[ManifestFile](
pyiceberg/avro/file.py:177: in __enter__
    self.reader = resolve_reader(self.schema, self.read_schema, self.read_types, self.read_enums)
pyiceberg/avro/resolver.py:235: in resolve_reader
    return visit_with_partner(file_schema, read_schema, ReadSchemaResolver(read_types, read_enums), SchemaPartnerAccessor())  # type: ignore
../../.pyenv/versions/3.12.8/lib/python3.12/functools.py:909: in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
pyiceberg/schema.py:626: in _
    return visitor.schema(schema, partner, visit_with_partner(schema.as_struct(), struct_partner, visitor, accessor))  # type: ignore
../../.pyenv/versions/3.12.8/lib/python3.12/functools.py:909: in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
pyiceberg/schema.py:641: in _
    return visitor.struct(struct, partner, field_results)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <pyiceberg.avro.resolver.ReadSchemaResolver object at 0x103cf1590>
struct = StructType(fields=(NestedField(field_id=503, name='added_snapshot_id', field_type=LongType(), required=True), NestedFi...Type(), required=True), NestedField(field_id=502, name='partition_spec_id', field_type=IntegerType(), required=True),))
expected_struct = StructType(fields=(NestedField(field_id=500, name='manifest_path', field_type=StringType(), required=True), NestedFiel...ired=True), required=False), NestedField(field_id=519, name='key_metadata', field_type=BinaryType(), required=False),))
field_readers = [IntegerReader(), IntegerReader(), StringReader(), IntegerReader()]

    def struct(self, struct: StructType, expected_struct: Optional[IcebergType], field_readers: List[Reader]) -> Reader:
        read_struct_id = self.context[STRUCT_ROOT] if len(self.context) > 0 else STRUCT_ROOT
        struct_callable = self.read_types.get(read_struct_id, Record)

        if not expected_struct:
            return StructReader(tuple(enumerate(field_readers)), struct_callable, struct)

        if not isinstance(expected_struct, StructType):
            raise ResolveError(f"File/read schema are not aligned for struct, got {expected_struct}")

        expected_positions: Dict[int, int] = {field.field_id: pos for pos, field in enumerate(expected_struct.fields)}

        # first, add readers for the file fields that must be in order
        results: List[Tuple[Optional[int], Reader]] = [
            (
                expected_positions.get(field.field_id),
                # Check if we need to convert it to an Enum
                result_reader if not (enum_type := self.read_enums.get(field.field_id)) else EnumReader(enum_type, result_reader),
            )
            for field, result_reader in zip(struct.fields, field_readers)
        ]

        file_fields = {field.field_id for field in struct.fields}
        for pos, read_field in enumerate(expected_struct.fields):
            if read_field.field_id not in file_fields:
                if isinstance(read_field, NestedField) and read_field.initial_default is not None:
                    # The field is not in the file, but there is a default value
                    # and that one can be required
                    results.append((pos, DefaultReader(read_field.initial_default)))
                elif read_field.required:
>                   raise ResolveError(f"{read_field} is non-optional, and not part of the file schema")
E                   pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema

pyiceberg/avro/resolver.py:399: ResolveError
------------------------------ Captured stdout call -------------------------------
Redirected location: file:///Users/kevinliu/repos/iceberg-python/kevinliu_blmt/metadata/5745cfd0-32b6-43b6-af3a-a520d476eacb-585fd8f2f45f5d2c-f-manifest-list-00000-of-00001.avro
============================= short test summary info =============================
FAILED tests/table/test_v1_table.py::test_read_biglake_table - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
================================ 1 failed in 0.62s ================================
(3.9.21) ➜  iceberg-python git:(kevinjqliu/biglake-v1-table)
```
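For context on the `ResolveError`: field id 504 in the V2 manifest-list read schema is the required `added_files_count`, while (as I read the spec) the same id in a V1 manifest list is the optional `added_data_files_count`, so a V1 file that omits it cannot be resolved against the V2 read schema that `read_manifest_list` defaults to. Below is a minimal, self-contained sketch of that resolution rule; it is illustrative only, not pyiceberg code, and the field ids and required flags are my assumptions from the spec plus the visible parts of the stack trace above.

```python
# Illustrative sketch only (not pyiceberg code): why resolving a V1 manifest
# list against the V2 read schema fails, and why selecting the read schema by
# table format version would not.
from dataclasses import dataclass
from typing import List


@dataclass(frozen=True)
class Field:
    field_id: int
    name: str
    required: bool


# Two of the fields visible in the file schema from the stack trace above
# (the real file has more fields; they are truncated in the trace).
FILE_FIELDS = [
    Field(503, "added_snapshot_id", required=True),
    Field(502, "partition_spec_id", required=True),
]

# Assumption from my reading of the Iceberg spec: field 504 is the required
# `added_files_count` in the V2 manifest-list schema, but the optional
# `added_data_files_count` in V1.
READ_FIELDS_BY_VERSION = {
    1: FILE_FIELDS + [Field(504, "added_data_files_count", required=False)],
    2: FILE_FIELDS + [Field(504, "added_files_count", required=True)],
}


def resolve(file_fields: List[Field], read_fields: List[Field]) -> None:
    """Mimic the resolver check that raised the ResolveError above."""
    file_ids = {f.field_id for f in file_fields}
    for field in read_fields:
        if field.field_id not in file_ids and field.required:
            raise ValueError(f"{field.field_id}: {field.name}: required field missing from file schema")


resolve(FILE_FIELDS, READ_FIELDS_BY_VERSION[1])  # ok: the missing field 504 is optional in V1
try:
    resolve(FILE_FIELDS, READ_FIELDS_BY_VERSION[2])  # fails like the ResolveError above
except ValueError as err:
    print(err)  # 504: added_files_count: required field missing from file schema
```

If that reading is right, one plausible direction is to thread the table's format version into `read_manifest_list` and pick the manifest-list read schema from it, rather than always defaulting to the V2 schema as the linked lines currently do.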