kris-gaudel commented on issue #2325:
URL: 
https://github.com/apache/iceberg-python/issues/2325#issuecomment-3221265037

   @Declow I'm on macOS Sonoma 14.6.1 and Python 3.11.6.
   
   ```python
   from pyiceberg.catalog.memory import InMemoryCatalog
   import tracemalloc
   from datetime import datetime, timezone
   import polars as pl
   
   def generate_df():
       """Build a 1000-row polars DataFrame of synthetic "playback" events.

       Every row shares the same event type and origin. The two timestamp
       columns each repeat one UTC "now" value taken when the column is
       built, and the ``data`` column holds one small dict per row whose
       ``id`` and ``referenceId`` are derived from the row index.
       """
       row_count = 1000
       columns = {
           "event_type": ["playback"] * row_count,
           "event_origin": ["origin1"] * row_count,
           # Each timestamp column samples the clock once and repeats that value.
           "event_send_at": [datetime.now(timezone.utc)] * row_count,
           "event_saved_at": [datetime.now(timezone.utc)] * row_count,
       }
       columns["data"] = [
           {"calendarKey": "calendarKey", "id": str(idx), "referenceId": f"ref-{idx}"}
           for idx in range(row_count)
       ]
       return pl.DataFrame(columns)
   
   # Create the in-memory catalog and its namespace once. A generated frame is
   # needed here only to supply the Arrow schema for the new table.
   df = generate_df()
   catalog = InMemoryCatalog("default", warehouse="/tmp/iceberg")
   catalog.create_namespace("default")
   table = iceberg_table = catalog.create_table(
       "default.leak", schema=df.to_arrow().schema, location="/tmp/iceberg/leak"
   )
   
   # Rebind df to an empty frame; the loop below generates a fresh one per append.
   df = pl.DataFrame()
   import gc, objgraph
   
   from pyiceberg.manifest import _manifests
   
   def get_max_value_size(manifest_cache):
       """Return ``(key, size)`` for the entry whose value has the greatest length.

       ``manifest_cache`` is any mapping whose values support ``len()``. When
       several values share the greatest length, the first such entry in
       iteration order is returned. An empty mapping yields ``(None, -1)``.
       """
       largest_key, largest_size = None, -1
       for cache_key, cached_value in manifest_cache.items():
           current_size = len(cached_value)
           # Strict comparison keeps the earliest entry on ties.
           if current_size > largest_size:
               largest_key, largest_size = cache_key, current_size
       return largest_key, largest_size
   
   
   # Start tracing allocations before the append loop so each snapshot taken
   # below covers memory allocated by the writes.
   tracemalloc.start()
   for _ in range(200):
       df = generate_df()
       df.write_iceberg(table, mode="append")
   
       # debug
       manifest_cache = _manifests.cache
       print("ManifestFile instances alive:", objgraph.count("ManifestFile"))
       # The message is assembled from adjacent literals: a single-quoted
       # f-string may not contain a raw line break.
       print(
           f"size: {len(manifest_cache)}, keys: {len(manifest_cache)}, "
           f"values: {len(manifest_cache)}"
       )
       max_key, max_size = get_max_value_size(manifest_cache)
       print(f"Max size of the cache value tuple: {max_size}, key: {max_key}")
       # gc.collect()
       # Print the ten largest allocation sites, grouped by source line.
       snapshot = tracemalloc.take_snapshot()
       top_stats = snapshot.statistics("lineno")
       for stat in top_stats[:10]:
           print(stat)
   
       print()
   ```
   This is the script I used


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to