kris-gaudel commented on issue #2325:
URL:
https://github.com/apache/iceberg-python/issues/2325#issuecomment-3221265037
@Declow I'm on macOS Sonoma 14.6.1 and Python 3.11.6.
```python
from pyiceberg.catalog.memory import InMemoryCatalog
import tracemalloc
from datetime import datetime, timezone
import polars as pl
def generate_df(num_rows=1000):
    """Build a polars DataFrame of synthetic "playback" events.

    Args:
        num_rows: Number of rows to generate. Defaults to 1000, the size
            used by the original reproduction script.

    Returns:
        A ``pl.DataFrame`` with the columns ``event_type``, ``event_origin``,
        ``event_send_at``, ``event_saved_at`` and ``data``; each ``data``
        entry is a dict with ``calendarKey``, ``id`` and ``referenceId``.
    """
    df = pl.DataFrame(
        {
            "event_type": ["playback"] * num_rows,
            "event_origin": ["origin1"] * num_rows,
            # Each timestamp column repeats a single "now" value for all rows.
            "event_send_at": [datetime.now(timezone.utc)] * num_rows,
            "event_saved_at": [datetime.now(timezone.utc)] * num_rows,
            "data": [
                {
                    "calendarKey": "calendarKey",
                    "id": str(i),
                    "referenceId": f"ref-{i}",
                }
                for i in range(num_rows)
            ],
        }
    )
    return df
# Create the in-memory catalog, its namespace and the target table once.
# (The pasted script repeated the first three statements verbatim, building
# and discarding a throwaway catalog and DataFrame.)
df = generate_df()
catalog = InMemoryCatalog("default", warehouse="/tmp/iceberg")
catalog.create_namespace("default")
# The sample frame is only needed here to derive the table's Arrow schema.
table = iceberg_table = catalog.create_table(
    "default.leak", schema=df.to_arrow().schema, location="/tmp/iceberg/leak"
)
# Rebind df to an empty frame so the sample data is not kept referenced
# while the measurement loop runs.
df = pl.DataFrame()
import gc, objgraph
from pyiceberg.manifest import _manifests
def get_max_value_size(manifest_cache):
    """Find the cache entry whose value has the greatest length.

    Args:
        manifest_cache: A mapping whose values support ``len()``.

    Returns:
        A ``(key, size)`` tuple for the entry with the longest value. On
        ties the first such entry in iteration order wins. Returns
        ``(None, -1)`` when the mapping is empty.
    """
    return max(
        ((key, len(value)) for key, value in manifest_cache.items()),
        key=lambda item: item[1],
        # Sentinel so an empty cache does not raise ValueError.
        default=(None, -1),
    )
# Start tracking allocations before any appends happen.
tracemalloc.start()
# NOTE(review): the pasted script lost its indentation. Everything below is
# assumed to belong to the loop body (the trailing bare print() reads as a
# per-iteration separator) — confirm against the original script.
for i in range(200):
    df = generate_df()
    df.write_iceberg(table, mode="append")

    # debug
    manifest_cache = _manifests.cache
    print("ManifestFile instances alive:", objgraph.count("ManifestFile"))
    # The message was hard-wrapped inside a single-quoted f-string, which is
    # a syntax error; it is rejoined here via implicit string concatenation.
    print(
        f"size: {len(manifest_cache)}, keys: {len(manifest_cache)}, "
        f"values: {len(manifest_cache)}"
    )
    max_key, max_size = get_max_value_size(manifest_cache)
    print(f"Max size of the cache value tuple: {max_size}, key: {max_key}")

    # Left disabled as in the original script.
    # gc.collect()
    snapshot = tracemalloc.take_snapshot()
    top_stats = snapshot.statistics("lineno")
    # Report the ten largest entries of the per-line statistics.
    for stat in top_stats[:10]:
        print(stat)
    print()
```
This is the script I used.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]