rdblue commented on code in PR #6775:
URL: https://github.com/apache/iceberg/pull/6775#discussion_r1199488144


##########
python/pyiceberg/table/__init__.py:
##########
@@ -401,9 +423,38 @@ def plan_files(self) -> Iterator[FileScanTask]:
                             metrics_evaluator,
                         )
                         for manifest in manifests
+                        if (manifest.content is None or manifest.content == 
ManifestContent.DATA)
+                        or (
+                            # Not interested in deletes that are older than 
the data
+                            manifest.content == ManifestContent.DELETES
+                            and (manifest.sequence_number or 
INITIAL_SEQUENCE_NUMBER) >= min_sequence_number
+                        )
                     ],
                 )
+            ):
+                if datafile.content is None or datafile.content == 
DataFileContent.DATA:
+                    data_datafiles.append(datafile)
+                elif datafile.content == DataFileContent.POSITION_DELETES:
+                    deletes_positional.append(datafile)
+                elif datafile.content == DataFileContent.EQUALITY_DELETES:
+                    raise ValueError(
+                        "PyIceberg does not yet support equality deletes: 
https://github.com/apache/iceberg/issues/6568"
+                    )
+                else:
+                    raise ValueError(f"Unknown DataFileContent: 
{datafile.content}")
+
+        return [
+            FileScanTask(data_file, 
delete_files=self._match_deletes_to_datafile(data_file, deletes_positional))
+            for data_file in data_datafiles
+        ]
+
+    def _match_deletes_to_datafile(self, data_file: DataFile, 
positional_delete_files: List[DataFile]) -> Set[DataFile]:
+        return set(

Review Comment:
   I think this is going to over-match delete files to data files. An older 
delete file could match a new data file because the range of `file_path` could 
be large and not helpful.
   
   I think all you need to do is filter this preliminary set by comparing the 
sequence number of the data file with those of the matching delete files.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to