PetrasTYR opened a new issue, #1087:
URL: https://github.com/apache/iceberg-python/issues/1087
### Apache Iceberg version
0.7.1 (latest release)
### Please describe the bug 🐞
Hello!
I am trying to use Nessie as an Iceberg catalog, specifically the REST
catalog that the Nessie server provides, while writing to an S3 bucket in LocalStack,
but I get this error: `OSError: When initiating multiple part upload for key
'rpmd/performance_68d55afc-e4d7-4055-932c-25f52e0e2f74/data/00000-0-b33665ff-ccbf-426f-bbd1-94353801131a.parquet'
in bucket 'warehouse': AWS Error NETWORK_CONNECTION during
CreateMultipartUpload operation: Encountered network error when sending http
request`
docker-compose.yaml:
```
# version: '3.9'
# Local stack for the reproduction: a Nessie catalog server, a Trino
# coordinator and worker, and LocalStack. All services share the
# "nessie-network" bridge network.
services:
  # Nessie server. Its version store (DYNAMODB) and its S3 default options
  # both point at the "localstack" container on port 4566.
  nessie:
    image: ghcr.io/projectnessie/nessie:0.95.0
    container_name: nessie
    ports:
      - "19120:19120"
      - "9001:9000"
    environment:
      - NESSIE_SERVER_DEFAULT-BRANCH=main
      - NESSIE_VERSION_STORE_TYPE=DYNAMODB
      - QUARKUS_DYNAMODB_AWS_REGION=ap-southeast-1
      - QUARKUS_DYNAMODB_ENDPOINT-OVERRIDE=http://localstack:4566
      - NESSIE_CATALOG_WAREHOUSEDEFAULTS_DEFAULT-WAREHOUSE=warehouse
      - NESSIE_CATALOG_DEFAULT-WAREHOUSE=warehouse
      - NESSIE_CATALOG_WAREHOUSES_WAREHOUSE_LOCATION=s3://warehouse
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_REGION=ap-southeast-1
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_ACCESS-KEY_NAME=fake
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_ACCESS-KEY_SECRET=password
      # NOTE(review): "localstack" is a compose service name; presumably it
      # only resolves inside nessie-network. If Nessie passes this endpoint
      # on to catalog clients running on the host, their S3 requests would
      # fail to connect. Nessie may offer a separate client-facing endpoint
      # setting for this case - confirm against the Nessie S3 docs.
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_ENDPOINT=http://localstack:4566
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_PATH-STYLE-ACCESS=true
      - NESSIE_SERVER_AUTHENTICATION_ENABLED=false
      - AWS_ACCESS_KEY_ID=fake
      - AWS_SECRET_ACCESS_KEY=password
      - AWS_REGION=ap-southeast-1
    networks:
      - nessie-network

  # Trino container published on host port 8080; catalog and server
  # configuration are bind-mounted from the working directory.
  trino:
    image: trinodb/trino:453
    container_name: trino
    ports:
      - "8080:8080"
    networks:
      - nessie-network
    volumes:
      - "./iceberg.properties:/etc/trino/catalog/iceberg.properties"
      - "./iceberg.properties:/etc/catalog/iceberg.properties"
      - "./config.properties:/etc/trino/config.properties"

  # Second Trino container using worker-config.properties, published on
  # host port 8081.
  trino-worker:
    image: trinodb/trino:453
    container_name: trino-worker
    ports:
      - "8081:8080"
    networks:
      - nessie-network
    volumes:
      - "./iceberg.properties:/etc/trino/catalog/iceberg.properties"
      - "./worker-config.properties:/etc/trino/config.properties"

  # LocalStack, published on host port 4566; this is the endpoint the
  # Nessie service is configured to use for DynamoDB and S3.
  localstack:
    image: localstack/localstack:latest
    container_name: localstack
    ports:
      - "4566:4566"
    networks:
      - nessie-network
    environment:
      - DEBUG=${DEBUG-}
      - DOCKER_HOST=unix:///var/run/docker.sock
      - SKIP_SSL_CERT_DOWNLOAD=1
      - EDGE_PORT=4566

networks:
  nessie-network:
    driver: bridge
```
My Python file:
```
from pyiceberg.catalog import load_rest
from pyiceberg.schema import Schema
from pyiceberg.types import StringType, NestedField, DoubleType
from pyiceberg.partitioning import PartitionSpec, PartitionField
import numpy as np
import pandas as pd
# Reproduction script: build a small pandas DataFrame, create (or load) the
# "rpmd.performance" table through the REST catalog, and append the data.

rows = 10**1
countries = ["US", "CA", "UK"]
attr2 = ["OPEN", "CLOSE", "LOW", "HIGH"]

# One record per row; country and status cycle through their value lists,
# return_index is a random float.
data_orig = pd.DataFrame(
    [
        {
            "countries": countries[i % len(countries)],
            "status": attr2[i % len(attr2)],
            "return_index": np.random.rand(),
        }
        for i in range(rows)
    ]
)

# Each column gets its own field id. The partition spec below refers to
# "countries" as source_id=1 and "status" as source_id=2, so the ids must be
# distinct and line up with those references.
schema = Schema(
    NestedField(field_id=1, name="countries", field_type=StringType(), required=False),
    NestedField(field_id=2, name="status", field_type=StringType(), required=False),
    NestedField(field_id=3, name="return_index", field_type=DoubleType(), required=False),
)

# Identity partitioning on the two string columns (currently not passed to
# create_table_if_not_exists below).
partition_spec = PartitionSpec(
    fields=[
        PartitionField(source_id=1, field_id=1000, name="countries", transform="identity"),
        PartitionField(source_id=2, field_id=1001, name="status", transform="identity"),
    ]
)

# NOTE(review): only the catalog URI is configured here. The stack trace
# shows the S3 write failing with NETWORK_CONNECTION, and the compose file
# configures the S3 endpoint as http://localstack:4566 - presumably a
# hostname that does not resolve from the host running this script. An
# endpoint reachable from the host is likely needed; confirm how the
# catalog-provided and client-side S3 settings are merged.
catalog = load_rest(
    "rest",
    conf={
        "uri": "http://localhost:19120/iceberg",
    },
)

catalog.create_namespace_if_not_exists("rpmd")

tables = catalog.list_tables(namespace="rpmd")
print("tables")
print(tables)

table = catalog.create_table_if_not_exists(
    identifier="rpmd.performance",
    schema=schema,
    # partition_spec=partition_spec
)

import pyarrow as pa
import pyarrow.parquet as pq

# Convert to Arrow and append; this is the call that raises the OSError.
tbl = pa.Table.from_pandas(data_orig)
# pq.write_table()
table.append(tbl)
The stacktrace:
```
Traceback (most recent call last):
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py",
line 509, in append
for data_file in data_files:
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\io\pyarrow.py",
line 2354, in _dataframe_to_data_files
yield from write_file(
File
"C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\_base.py",
line 619, in result_iterator
yield _result_or_cancel(fs.pop())
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\_base.py",
line 317, in _result_or_cancel
return fut.result(timeout)
^^^^^^^^^^^^^^^^^^^
File
"C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\_base.py",
line 456, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File
"C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\_base.py",
line 401, in __get_result
raise self._exception
File
"C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\thread.py",
line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\io\pyarrow.py",
line 2174, in write_parquet
with fo.create(overwrite=True) as fos:
^^^^^^^^^^^^^^^^^^^^^^^^^
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\io\pyarrow.py",
line 307, in create
output_file = self._filesystem.open_output_stream(self._path,
buffer_size=self._buffer_size)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "pyarrow\\_fs.pyx", line 887, in
pyarrow._fs.FileSystem.open_output_stream
File "pyarrow\\error.pxi", line 155, in
pyarrow.lib.pyarrow_internal_check_status
File "pyarrow\\error.pxi", line 92, in pyarrow.lib.check_status
OSError: When initiating multiple part upload for key
'rpmd/performance_68d55afc-e4d7-4055-932c-25f52e0e2f74/data/00000-0-b33665ff-ccbf-426f-bbd1-94353801131a.parquet'
in bucket 'warehouse': AWS Error NETWORK_CONNECTION during
CreateMultipartUpload operation: Encountered network error when sending http
request
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\USER\Documents\Code\Playground\python-scripts\iceberg.py",
line 60, in <module>
table.append(tbl)
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py",
line 1578, in append
tx.append(df=df, snapshot_properties=snapshot_properties)
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py",
line 503, in append
with append_method() as append_files:
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py",
line 2094, in __exit__
self.commit()
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py",
line 2090, in commit
self._transaction._apply(*self._commit())
^^^^^^^^^^^^^^
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py",
line 3218, in _commit
with write_manifest_list(
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\manifest.py",
line 924, in __enter__
self._writer.__enter__()
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\avro\file.py",
line 258, in __enter__
self.output_stream = self.output_file.create(overwrite=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\io\pyarrow.py",
line 307, in create
output_file = self._filesystem.open_output_stream(self._path,
buffer_size=self._buffer_size)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "pyarrow\\_fs.pyx", line 887, in
pyarrow._fs.FileSystem.open_output_stream
File "pyarrow\\error.pxi", line 155, in
pyarrow.lib.pyarrow_internal_check_status
File "pyarrow\\error.pxi", line 92, in pyarrow.lib.check_status
OSError: When initiating multiple part upload for key
'rpmd/performance_68d55afc-e4d7-4055-932c-25f52e0e2f74/metadata/snap-2642215192225696858-0-b33665ff-ccbf-426f-bbd1-94353801131a.avro'
in bucket 'warehouse': AWS Error NETWORK_CONNECTION during
CreateMultipartUpload operation: Encountered network error when sending http
request
```
I've tried the solution in issue #974 but it still does not work.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]