Re: [PR] feat: Introduce ArrowArrayReader factory on FileScanTask [iceberg-cpp]

via GitHub Wed, 03 Sep 2025 01:08:01 -0700


wgtmac commented on code in PR #200:
URL: https://github.com/apache/iceberg-cpp/pull/200#discussion_r2313130243



##########
test/file_scan_task_test.cc:
##########
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <arrow/array.h>
+#include <arrow/c/bridge.h>
+#include <arrow/json/from_string.h>
+#include <arrow/record_batch.h>
+#include <arrow/table.h>
+#include <arrow/util/key_value_metadata.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <parquet/metadata.h>
+
+#include "iceberg/arrow/arrow_fs_file_io_internal.h"
+#include "iceberg/arrow_array_reader.h"
+#include "iceberg/file_format.h"
+#include "iceberg/manifest_entry.h"
+#include "iceberg/parquet/parquet_register.h"
+#include "iceberg/schema.h"
+#include "iceberg/table_scan.h"
+#include "iceberg/type.h"
+#include "iceberg/util/checked_cast.h"
+#include "matchers.h"

Review Comment:
   ```suggestion
   
   #include "matchers.h"
   ```



##########
src/iceberg/table_scan.h:
##########
@@ -185,4 +169,40 @@ class ICEBERG_EXPORT DataTableScan : public TableScan {
   Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const 
override;
 };
 
+/// \brief Task representing a data file and its corresponding delete files.
+class ICEBERG_EXPORT FileScanTask : public ScanTask {
+ public:
+  explicit FileScanTask(std::shared_ptr<DataFile> data_file);
+
+  /// \brief The data file that should be read by this scan task.
+  const std::shared_ptr<DataFile>& data_file() const;
+
+  /// \brief The total size in bytes of the file split to be read.
+  int64_t size_bytes() const override;
+
+  /// \brief The number of files that should be read by this scan task.
+  int32_t files_count() const override;
+
+  /// \brief The number of rows that should be read by this scan task.
+  int64_t estimated_row_count() const override;
+
+  /**
+   * \brief Creates and returns an ArrowArrayReader to read the data for this 
task.
+   *
+   * This acts as a factory to instantiate a file-format-specific reader 
(e.g., Parquet)
+   * based on the metadata in this task and the provided context.
+   *
+   * \param context The table scan context, used to configure the reader 
(e.g., with the
+   * projected schema).
+   * \param io The FileIO instance for accessing the file data.
+   * \return A Result containing a unique pointer to the reader, or an error 
on failure.
+   */
+  Result<std::unique_ptr<ArrowArrayReader>> ToArrowArrayReader(

Review Comment:
   We should not use `TableScanContext` here since it is a transient object 
used inside TableScan. Downstreams no longer have access to it. So we need to 
pass required parameters one by one to this function.
   
   In this case, can you please move it back to the original location to 
minimize lines of change?



##########
src/iceberg/table_scan.cc:
##########
@@ -178,4 +170,29 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> 
DataTableScan::PlanFiles() co
   return tasks;
 }
 
+FileScanTask::FileScanTask(std::shared_ptr<DataFile> data_file)

Review Comment:
   ditto, move them back to the original location.



##########
src/iceberg/file_reader.h:
##########
@@ -34,7 +35,7 @@
 namespace iceberg {
 
 /// \brief Base reader class to read data from different file formats.
-class ICEBERG_EXPORT Reader {
+class ICEBERG_EXPORT Reader : public ArrowArrayReader {
  public:
   virtual ~Reader() = default;

Review Comment:
   We can remove this dtor now.



##########
src/iceberg/file_reader.h:
##########
@@ -45,15 +46,15 @@ class ICEBERG_EXPORT Reader {
   virtual Status Open(const struct ReaderOptions& options) = 0;
 
   /// \brief Close the reader.
-  virtual Status Close() = 0;

Review Comment:
   We don't need to redefine this and below.



##########
src/iceberg/table_scan.cc:
##########
@@ -21,7 +21,11 @@
 
 #include <algorithm>
 #include <ranges>
+#include <utility>
 
+#include <iceberg/file_format.h>
+

Review Comment:
   ```suggestion
   ```



##########
test/CMakeLists.txt:
##########
@@ -122,5 +122,6 @@ if(ICEBERG_BUILD_BUNDLE)
                    SOURCES
                    parquet_data_test.cc
                    parquet_schema_test.cc
-                   parquet_test.cc)
+                   parquet_test.cc
+                   file_scan_task_test.cc)

Review Comment:
   Add a new test for table scan?
   
   ```
     add_iceberg_test(scan_test
                      USE_BUNDLE
                      SOURCES
                      file_scan_task_test.cc)
   ```



##########
src/iceberg/table_scan.h:
##########
@@ -185,4 +169,40 @@ class ICEBERG_EXPORT DataTableScan : public TableScan {
   Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const 
override;
 };
 
+/// \brief Task representing a data file and its corresponding delete files.
+class ICEBERG_EXPORT FileScanTask : public ScanTask {
+ public:
+  explicit FileScanTask(std::shared_ptr<DataFile> data_file);
+
+  /// \brief The data file that should be read by this scan task.
+  const std::shared_ptr<DataFile>& data_file() const;
+
+  /// \brief The total size in bytes of the file split to be read.
+  int64_t size_bytes() const override;
+
+  /// \brief The number of files that should be read by this scan task.
+  int32_t files_count() const override;
+
+  /// \brief The number of rows that should be read by this scan task.
+  int64_t estimated_row_count() const override;
+
+  /**
+   * \brief Creates and returns an ArrowArrayReader to read the data for this 
task.

Review Comment:
   ```suggestion
      * \brief Returns an ArrowArrayReader to read the data for this task.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat: Introduce ArrowArrayReader factory on FileScanTask [iceberg-cpp]

Reply via email to