Re: [PR] Add plan tasks for TableScan [iceberg-python]

via GitHub Mon, 16 Dec 2024 01:25:57 -0800


ConeyLiu commented on code in PR #1427:
URL: https://github.com/apache/iceberg-python/pull/1427#discussion_r1886465895



##########
tests/integration/test_reads.py:
##########
@@ -873,3 +874,76 @@ def test_table_scan_empty_table(catalog: Catalog) -> None:
     result_table = tbl.scan().to_arrow()
 
     assert len(result_table) == 0
+
+
+@pytest.mark.integration
+def test_plan_tasks(session_catalog: Catalog) -> None:
+    from pyiceberg.table import TableProperties
+
+    table_name = "default.test_plan_tasks"
+    try:
+        session_catalog.drop_table(table_name)
+    except NoSuchTableError:
+        pass  # Just to make sure that the table doesn't exist
+
+    tbl = session_catalog.create_table(
+        table_name,
+        Schema(
+            NestedField(1, "number", LongType()),
+        ),
+        properties={TableProperties.PARQUET_ROW_GROUP_LIMIT: "1"},
+    )
+
+    # Write 10 row groups, that should end up as 10 batches
+    entries = 10
+    tbl.append(
+        pa.Table.from_pylist(
+            [
+                {
+                    "number": number,
+                }
+                for number in range(entries)
+            ],
+        )
+    )
+
+    assert len(tbl.inspect.files()) == 1
+
+    plan_files = list(tbl.scan().plan_files())
+    assert len(plan_files) == 1
+    data_file = plan_files[0].file
+    assert data_file.split_offsets is not None and 
len(data_file.split_offsets) == 10
+
+    plan_tasks = list(tbl.scan(options={TableProperties.READ_SPLIT_SIZE: 
1}).plan_task())
+    assert len(plan_tasks) == 10
+
+    split_offsets = []
+    for task in plan_tasks:
+        assert len(task.tasks) == 1
+        split_offsets.append(task.tasks[0].start)
+
+    assert split_offsets == plan_files[0].file.split_offsets
+
+    split_sizes = []
+    for i in range(1, len(data_file.split_offsets)):
+        split_sizes.append(data_file.split_offsets[i] - 
data_file.split_offsets[i - 1])
+
+    split_sizes.append(data_file.file_size_in_bytes - 
data_file.split_offsets[-1])
+
+    read_split_size = int(data_file.file_size_in_bytes / 4)
+    read_split_open_file_cost = 1
+    read_split_lookback = 5
+
+    plan_tasks = list(

Review Comment:
   @kevinjqliu added more tests here, pls give more advice.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Re: [PR] Add plan tasks for TableScan [iceberg-python]

Reply via email to