wgtmac commented on code in PR #127:
URL: https://github.com/apache/iceberg-cpp/pull/127#discussion_r2174062567


##########
src/iceberg/file_reader.h:
##########
@@ -29,6 +29,7 @@
 
 #include "iceberg/arrow_c_data.h"
 #include "iceberg/file_format.h"
+#include "iceberg/name_mapping.h"

Review Comment:
   ```suggestion
   ```
   
   We don't need this line because you've already used forward declaration 
below.



##########
src/iceberg/avro/avro_reader.cc:
##########
@@ -195,6 +204,156 @@ class AvroBatchReader::Impl {
     return arrow_array;
   }
 
+  // Apply field IDs to Avro schema nodes based on name mapping
+  Status ApplyFieldIdsFromNameMapping(const NameMapping& name_mapping,

Review Comment:
   Can we move this to `avro_schema_util_internal.h` and `avro_schema_util.cc` 
so we can have better test coverage?



##########
src/iceberg/avro/avro_reader.cc:
##########
@@ -96,11 +99,17 @@ class AvroBatchReader::Impl {
     // Validate field ids in the file schema.
     HasIdVisitor has_id_visitor;
     ICEBERG_RETURN_UNEXPECTED(has_id_visitor.Visit(file_schema));
+
     if (has_id_visitor.HasNoIds()) {
-      // TODO(gangwu): support applying field-ids based on name mapping
-      return NotImplemented("Avro file schema has no field IDs");
-    }
-    if (!has_id_visitor.AllHaveIds()) {
+      // Apply field IDs based on name mapping if available
+      if (options.name_mapping) {
+        
ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(*options.name_mapping,

Review Comment:
   Do we need to further check that all fields have applied field-ids?



##########
src/iceberg/avro/avro_constants.h:
##########
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <string_view>
+
+namespace iceberg::avro {

Review Comment:
   Thanks for adding this! Can we rename it to `iceberg/avro/constants.h` to 
have a shorter name?



##########
src/iceberg/avro/avro_reader.cc:
##########
@@ -30,11 +30,14 @@
 #include <avro/DataFile.hh>
 #include <avro/Generic.hh>
 #include <avro/GenericDatum.hh>
+#include <nlohmann/json.hpp>

Review Comment:
   ```suggestion
   ```



##########
test/avro_reader_test.cc:
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/avro/avro_reader.h"
+
+#include <avro/Compiler.hh>
+#include <avro/DataFile.hh>
+#include <avro/Generic.hh>
+#include <avro/GenericDatum.hh>
+#include <avro/ValidSchema.hh>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <nlohmann/json.hpp>
+
+#include "iceberg/json_internal.h"
+#include "iceberg/name_mapping.h"
+#include "iceberg/result.h"
+#include "iceberg/schema.h"
+#include "matchers.h"
+
+namespace iceberg::avro {
+
+class AvroReaderTest : public ::testing::Test {

Review Comment:
   These test cases are unrelated to this change or avro reader? If yes, please 
remove them.



##########
src/iceberg/avro/avro_reader.cc:
##########
@@ -96,11 +99,17 @@ class AvroBatchReader::Impl {
     // Validate field ids in the file schema.
     HasIdVisitor has_id_visitor;
     ICEBERG_RETURN_UNEXPECTED(has_id_visitor.Visit(file_schema));
+
     if (has_id_visitor.HasNoIds()) {
-      // TODO(gangwu): support applying field-ids based on name mapping
-      return NotImplemented("Avro file schema has no field IDs");
-    }
-    if (!has_id_visitor.AllHaveIds()) {
+      // Apply field IDs based on name mapping if available
+      if (options.name_mapping) {
+        
ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(*options.name_mapping,
+                                                               
file_schema.root().get()));
+      } else {
+        return NotImplemented(

Review Comment:
   ```suggestion
           return InvalidSchema(
   ```
   
   You've implemented this so it should be a new error now.



##########
src/iceberg/avro/avro_reader.cc:
##########
@@ -195,6 +209,147 @@ class AvroBatchReader::Impl {
     return arrow_array;
   }
 
+  // Apply field IDs to Avro schema nodes based on name mapping
+  Status ApplyFieldIdsFromNameMapping(const ::avro::NodePtr& node,
+                                      const NameMapping& name_mapping) {
+    switch (node->type()) {
+      case ::avro::AVRO_RECORD:
+        return ApplyFieldIdsToRecord(node, name_mapping);
+      case ::avro::AVRO_ARRAY:
+        return ApplyFieldIdsToArray(node, name_mapping);
+      case ::avro::AVRO_MAP:
+        return ApplyFieldIdsToMap(node, name_mapping);
+      case ::avro::AVRO_UNION:
+        return ApplyFieldIdsToUnion(node, name_mapping);
+      case ::avro::AVRO_BOOL:
+      case ::avro::AVRO_INT:
+      case ::avro::AVRO_LONG:
+      case ::avro::AVRO_FLOAT:
+      case ::avro::AVRO_DOUBLE:
+      case ::avro::AVRO_STRING:
+      case ::avro::AVRO_BYTES:
+      case ::avro::AVRO_FIXED:
+        return {};
+      case ::avro::AVRO_NULL:
+      case ::avro::AVRO_ENUM:
+      default:
+        return InvalidSchema("Unsupported Avro type for field ID application: 
{}",
+                             static_cast<int>(node->type()));

Review Comment:
   There is a `std::string ToString(const ::avro::NodePtr& node)` in the 
`avro_schema_util_internal.h` to make it easier for better error message.



##########
src/iceberg/avro/avro_reader.cc:
##########
@@ -195,6 +204,156 @@ class AvroBatchReader::Impl {
     return arrow_array;
   }
 
+  // Apply field IDs to Avro schema nodes based on name mapping
+  Status ApplyFieldIdsFromNameMapping(const NameMapping& name_mapping,
+                                      ::avro::Node* node) {
+    switch (node->type()) {
+      case ::avro::AVRO_RECORD:
+        return ApplyFieldIdsToRecord(node, name_mapping);
+      case ::avro::AVRO_ARRAY:
+        return ApplyFieldIdsToArray(node, name_mapping);
+      case ::avro::AVRO_MAP:
+        return ApplyFieldIdsToMap(node, name_mapping);
+      case ::avro::AVRO_UNION:
+        return ApplyFieldIdsToUnion(node, name_mapping);
+      case ::avro::AVRO_BOOL:
+      case ::avro::AVRO_INT:
+      case ::avro::AVRO_LONG:
+      case ::avro::AVRO_FLOAT:
+      case ::avro::AVRO_DOUBLE:
+      case ::avro::AVRO_STRING:
+      case ::avro::AVRO_BYTES:
+      case ::avro::AVRO_FIXED:
+        return {};
+      case ::avro::AVRO_NULL:
+      case ::avro::AVRO_ENUM:
+      default:
+        return InvalidSchema("Unsupported Avro type for field ID application: 
{}",
+                             static_cast<int>(node->type()));
+    }
+  }
+
+  Status ApplyFieldIdsToRecord(::avro::Node* node, const NameMapping& 
name_mapping) {
+    for (size_t i = 0; i < node->leaves(); ++i) {
+      const std::string& field_name = node->nameAt(i);
+      ::avro::Node* field_node = node->leafAt(i).get();
+
+      // Try to find field ID by name in the name mapping
+      if (auto field_ref = name_mapping.Find(field_name)) {
+        if (field_ref->get().field_id.has_value()) {
+          // Add field ID attribute to the node
+          ::avro::CustomAttributes attributes;
+          attributes.addAttribute(std::string(kFieldId),
+                                  
std::to_string(field_ref->get().field_id.value()),
+                                  false);
+          node->addCustomAttributesForField(attributes);

Review Comment:
   I think `addCustomAttributesForField` is not supposed to work like this. 
IIUC, `addCustomAttributesForField` is called for each sub-field of a parent 
node. It is impossible to in-place modify attributes of an `avro::Node`. We 
might need to create a new avro schema by merging original attributes with new 
ones.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to