wgtmac commented on code in PR #200: URL: https://github.com/apache/iceberg-cpp/pull/200#discussion_r2313130243
########## test/file_scan_task_test.cc: ########## @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <arrow/array.h> +#include <arrow/c/bridge.h> +#include <arrow/json/from_string.h> +#include <arrow/record_batch.h> +#include <arrow/table.h> +#include <arrow/util/key_value_metadata.h> +#include <parquet/arrow/reader.h> +#include <parquet/arrow/writer.h> +#include <parquet/metadata.h> + +#include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow_array_reader.h" +#include "iceberg/file_format.h" +#include "iceberg/manifest_entry.h" +#include "iceberg/parquet/parquet_register.h" +#include "iceberg/schema.h" +#include "iceberg/table_scan.h" +#include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" +#include "matchers.h" Review Comment: ```suggestion #include "matchers.h" ``` ########## src/iceberg/table_scan.h: ########## @@ -185,4 +169,40 @@ class ICEBERG_EXPORT DataTableScan : public TableScan { Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const override; }; +/// \brief Task representing a data file and its corresponding delete files. +class ICEBERG_EXPORT FileScanTask : public ScanTask { + public: + explicit FileScanTask(std::shared_ptr<DataFile> data_file); + + /// \brief The data file that should be read by this scan task. + const std::shared_ptr<DataFile>& data_file() const; + + /// \brief The total size in bytes of the file split to be read. + int64_t size_bytes() const override; + + /// \brief The number of files that should be read by this scan task. + int32_t files_count() const override; + + /// \brief The number of rows that should be read by this scan task. + int64_t estimated_row_count() const override; + + /** + * \brief Creates and returns an ArrowArrayReader to read the data for this task. + * + * This acts as a factory to instantiate a file-format-specific reader (e.g., Parquet) + * based on the metadata in this task and the provided context. + * + * \param context The table scan context, used to configure the reader (e.g., with the + * projected schema). + * \param io The FileIO instance for accessing the file data. + * \return A Result containing a unique pointer to the reader, or an error on failure. + */ + Result<std::unique_ptr<ArrowArrayReader>> ToArrowArrayReader( Review Comment: We should not use `TableScanContext` here since it is a transient object used inside TableScan. Downstreams no longer have access to it. So we need to pass required parameters one by one to this function. In this case, can you please move it back to the original location to minimize lines of change? ########## src/iceberg/table_scan.cc: ########## @@ -178,4 +170,29 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() co return tasks; } +FileScanTask::FileScanTask(std::shared_ptr<DataFile> data_file) Review Comment: ditto, move them back to the original location. ########## src/iceberg/file_reader.h: ########## @@ -34,7 +35,7 @@ namespace iceberg { /// \brief Base reader class to read data from different file formats. -class ICEBERG_EXPORT Reader { +class ICEBERG_EXPORT Reader : public ArrowArrayReader { public: virtual ~Reader() = default; Review Comment: We can remove this dtor now. ########## src/iceberg/file_reader.h: ########## @@ -45,15 +46,15 @@ class ICEBERG_EXPORT Reader { virtual Status Open(const struct ReaderOptions& options) = 0; /// \brief Close the reader. - virtual Status Close() = 0; Review Comment: We don't need to redefine this and below. ########## src/iceberg/table_scan.cc: ########## @@ -21,7 +21,11 @@ #include <algorithm> #include <ranges> +#include <utility> +#include <iceberg/file_format.h> + Review Comment: ```suggestion ``` ########## test/CMakeLists.txt: ########## @@ -122,5 +122,6 @@ if(ICEBERG_BUILD_BUNDLE) SOURCES parquet_data_test.cc parquet_schema_test.cc - parquet_test.cc) + parquet_test.cc + file_scan_task_test.cc) Review Comment: Add a new test for table scan? ``` add_iceberg_test(scan_test USE_BUNDLE SOURCES file_scan_task_test.cc) ``` ########## src/iceberg/table_scan.h: ########## @@ -185,4 +169,40 @@ class ICEBERG_EXPORT DataTableScan : public TableScan { Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const override; }; +/// \brief Task representing a data file and its corresponding delete files. +class ICEBERG_EXPORT FileScanTask : public ScanTask { + public: + explicit FileScanTask(std::shared_ptr<DataFile> data_file); + + /// \brief The data file that should be read by this scan task. + const std::shared_ptr<DataFile>& data_file() const; + + /// \brief The total size in bytes of the file split to be read. + int64_t size_bytes() const override; + + /// \brief The number of files that should be read by this scan task. + int32_t files_count() const override; + + /// \brief The number of rows that should be read by this scan task. + int64_t estimated_row_count() const override; + + /** + * \brief Creates and returns an ArrowArrayReader to read the data for this task. Review Comment: ```suggestion * \brief Returns an ArrowArrayReader to read the data for this task. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
