geruh commented on code in PR #2951:
URL: https://github.com/apache/iceberg-python/pull/2951#discussion_r2725867963


##########
pyiceberg/manifest.py:
##########
@@ -892,15 +891,49 @@ def __hash__(self) -> int:
         return hash(self.manifest_path)
 
 
-# Global cache for manifest lists
-_manifest_cache: LRUCache[Any, tuple[ManifestFile, ...]] = 
LRUCache(maxsize=128)
+# Global cache for individual ManifestFile objects, keyed by manifest_path.
+# This avoids duplicating ManifestFile objects when multiple manifest lists
+# share the same manifests (which is common after appends).
+_manifest_cache: LRUCache[str, ManifestFile] = LRUCache(maxsize=512)
+
+# Lock for thread-safe cache access
+_manifest_cache_lock = threading.RLock()
 
 
-@cached(cache=_manifest_cache, key=lambda io, manifest_list: 
hashkey(manifest_list), lock=threading.RLock())
 def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]:
-    """Read and cache manifests from the given manifest list, returning a 
tuple to prevent modification."""
+    """Read manifests from the given manifest list, caching individual 
ManifestFile objects.
+
+    Unlike caching entire manifest lists, this approach caches individual 
ManifestFile
+    objects by their manifest_path. This is more memory-efficient because:
+    - ManifestList1 contains: (ManifestFile1)
+    - ManifestList2 contains: (ManifestFile1, ManifestFile2)
+    - ManifestList3 contains: (ManifestFile1, ManifestFile2, ManifestFile3)
+
+    With per-ManifestFile caching, ManifestFile1 is stored only once and 
reused,
+    instead of being duplicated in each manifest list's cached tuple.
+
+    Args:
+        io: The FileIO to read the manifest list.
+        manifest_list: The path to the manifest list file.
+
+    Returns:
+        A tuple of ManifestFile objects (tuple to prevent modification).
+    """
     file = io.new_input(manifest_list)
-    return tuple(read_manifest_list(file))
+    result = []
+
+    for manifest_file in read_manifest_list(file):
+        manifest_path = manifest_file.manifest_path
+        with _manifest_cache_lock:

Review Comment:
   ```python
   all_manifests = list(read_manifest_list(file))
   
   result = []
   with _manifest_cache_lock:  <----- 
       for manifest_file in all_manifests:
   
   ```
   
   This seems to be safe wdyt?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to