wgtmac commented on code in PR #127: URL: https://github.com/apache/iceberg-cpp/pull/127#discussion_r2174062567
########## src/iceberg/file_reader.h: ########## @@ -29,6 +29,7 @@ #include "iceberg/arrow_c_data.h" #include "iceberg/file_format.h" +#include "iceberg/name_mapping.h" Review Comment: ```suggestion ``` We don't need this line because you've already used forward declaration below. ########## src/iceberg/avro/avro_reader.cc: ########## @@ -195,6 +204,156 @@ class AvroBatchReader::Impl { return arrow_array; } + // Apply field IDs to Avro schema nodes based on name mapping + Status ApplyFieldIdsFromNameMapping(const NameMapping& name_mapping, Review Comment: Can we move this to `avro_schema_util_internal.h` and `avro_schema_util.cc` so we can have better test coverage? ########## src/iceberg/avro/avro_reader.cc: ########## @@ -96,11 +99,17 @@ class AvroBatchReader::Impl { // Validate field ids in the file schema. HasIdVisitor has_id_visitor; ICEBERG_RETURN_UNEXPECTED(has_id_visitor.Visit(file_schema)); + if (has_id_visitor.HasNoIds()) { - // TODO(gangwu): support applying field-ids based on name mapping - return NotImplemented("Avro file schema has no field IDs"); - } - if (!has_id_visitor.AllHaveIds()) { + // Apply field IDs based on name mapping if available + if (options.name_mapping) { + ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(*options.name_mapping, Review Comment: Do we need to further check that all fields have applied field-ids? ########## src/iceberg/avro/avro_constants.h: ########## @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <string_view> + +namespace iceberg::avro { Review Comment: Thanks for adding this! Can we rename it to `iceberg/avro/constants.h` to have a shorter name? ########## src/iceberg/avro/avro_reader.cc: ########## @@ -30,11 +30,14 @@ #include <avro/DataFile.hh> #include <avro/Generic.hh> #include <avro/GenericDatum.hh> +#include <nlohmann/json.hpp> Review Comment: ```suggestion ``` ########## test/avro_reader_test.cc: ########## @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/avro/avro_reader.h" + +#include <avro/Compiler.hh> +#include <avro/DataFile.hh> +#include <avro/Generic.hh> +#include <avro/GenericDatum.hh> +#include <avro/ValidSchema.hh> +#include <gmock/gmock.h> +#include <gtest/gtest.h> +#include <nlohmann/json.hpp> + +#include "iceberg/json_internal.h" +#include "iceberg/name_mapping.h" +#include "iceberg/result.h" +#include "iceberg/schema.h" +#include "matchers.h" + +namespace iceberg::avro { + +class AvroReaderTest : public ::testing::Test { Review Comment: These test cases are unrelated to this change or avro reader? If yes, please remove them. ########## src/iceberg/avro/avro_reader.cc: ########## @@ -96,11 +99,17 @@ class AvroBatchReader::Impl { // Validate field ids in the file schema. HasIdVisitor has_id_visitor; ICEBERG_RETURN_UNEXPECTED(has_id_visitor.Visit(file_schema)); + if (has_id_visitor.HasNoIds()) { - // TODO(gangwu): support applying field-ids based on name mapping - return NotImplemented("Avro file schema has no field IDs"); - } - if (!has_id_visitor.AllHaveIds()) { + // Apply field IDs based on name mapping if available + if (options.name_mapping) { + ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(*options.name_mapping, + file_schema.root().get())); + } else { + return NotImplemented( Review Comment: ```suggestion return InvalidSchema( ``` You've implemented this so it should be a new error now. ########## src/iceberg/avro/avro_reader.cc: ########## @@ -195,6 +209,147 @@ class AvroBatchReader::Impl { return arrow_array; } + // Apply field IDs to Avro schema nodes based on name mapping + Status ApplyFieldIdsFromNameMapping(const ::avro::NodePtr& node, + const NameMapping& name_mapping) { + switch (node->type()) { + case ::avro::AVRO_RECORD: + return ApplyFieldIdsToRecord(node, name_mapping); + case ::avro::AVRO_ARRAY: + return ApplyFieldIdsToArray(node, name_mapping); + case ::avro::AVRO_MAP: + return ApplyFieldIdsToMap(node, name_mapping); + case ::avro::AVRO_UNION: + return ApplyFieldIdsToUnion(node, name_mapping); + case ::avro::AVRO_BOOL: + case ::avro::AVRO_INT: + case ::avro::AVRO_LONG: + case ::avro::AVRO_FLOAT: + case ::avro::AVRO_DOUBLE: + case ::avro::AVRO_STRING: + case ::avro::AVRO_BYTES: + case ::avro::AVRO_FIXED: + return {}; + case ::avro::AVRO_NULL: + case ::avro::AVRO_ENUM: + default: + return InvalidSchema("Unsupported Avro type for field ID application: {}", + static_cast<int>(node->type())); Review Comment: There is a `std::string ToString(const ::avro::NodePtr& node)` in the `avro_schema_util_internal.h` to make it easier for better error message. ########## src/iceberg/avro/avro_reader.cc: ########## @@ -195,6 +204,156 @@ class AvroBatchReader::Impl { return arrow_array; } + // Apply field IDs to Avro schema nodes based on name mapping + Status ApplyFieldIdsFromNameMapping(const NameMapping& name_mapping, + ::avro::Node* node) { + switch (node->type()) { + case ::avro::AVRO_RECORD: + return ApplyFieldIdsToRecord(node, name_mapping); + case ::avro::AVRO_ARRAY: + return ApplyFieldIdsToArray(node, name_mapping); + case ::avro::AVRO_MAP: + return ApplyFieldIdsToMap(node, name_mapping); + case ::avro::AVRO_UNION: + return ApplyFieldIdsToUnion(node, name_mapping); + case ::avro::AVRO_BOOL: + case ::avro::AVRO_INT: + case ::avro::AVRO_LONG: + case ::avro::AVRO_FLOAT: + case ::avro::AVRO_DOUBLE: + case ::avro::AVRO_STRING: + case ::avro::AVRO_BYTES: + case ::avro::AVRO_FIXED: + return {}; + case ::avro::AVRO_NULL: + case ::avro::AVRO_ENUM: + default: + return InvalidSchema("Unsupported Avro type for field ID application: {}", + static_cast<int>(node->type())); + } + } + + Status ApplyFieldIdsToRecord(::avro::Node* node, const NameMapping& name_mapping) { + for (size_t i = 0; i < node->leaves(); ++i) { + const std::string& field_name = node->nameAt(i); + ::avro::Node* field_node = node->leafAt(i).get(); + + // Try to find field ID by name in the name mapping + if (auto field_ref = name_mapping.Find(field_name)) { + if (field_ref->get().field_id.has_value()) { + // Add field ID attribute to the node + ::avro::CustomAttributes attributes; + attributes.addAttribute(std::string(kFieldId), + std::to_string(field_ref->get().field_id.value()), + false); + node->addCustomAttributesForField(attributes); Review Comment: I think `addCustomAttributesForField` is not supposed to work like this. IIUC, `addCustomAttributesForField` is called for each sub-field of a parent node. It is impossible to in-place modify attributes of an `avro::Node`. We might need to create a new avro schema by merging original attributes with new ones. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org