wgtmac commented on code in PR #127: URL: https://github.com/apache/iceberg-cpp/pull/127#discussion_r2220839922
########## test/avro_schema_test.cc: ########## @@ -1057,4 +1059,366 @@ TEST(AvroSchemaProjectionTest, ProjectDecimalIncompatible) { ASSERT_THAT(projection_result, HasErrorMessage("Cannot read")); } +// NameMapping tests for Avro schema context +class NameMappingAvroSchemaTest : public ::testing::Test { + protected: + // Helper function to create a simple name mapping + std::unique_ptr<NameMapping> CreateSimpleNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + fields.emplace_back(MappedField{.names = {"age"}, .field_id = 3}); + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a nested name mapping + std::unique_ptr<NameMapping> CreateNestedNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for address + std::vector<MappedField> address_fields; + address_fields.emplace_back(MappedField{.names = {"street"}, .field_id = 10}); + address_fields.emplace_back(MappedField{.names = {"city"}, .field_id = 11}); + address_fields.emplace_back(MappedField{.names = {"zip"}, .field_id = 12}); + auto address_mapping = MappedFields::Make(std::move(address_fields)); + + fields.emplace_back(MappedField{.names = {"address"}, + .field_id = 2, + .nested_mapping = std::move(address_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for array types + std::unique_ptr<NameMapping> CreateArrayNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for array element + std::vector<MappedField> element_fields; + element_fields.emplace_back(MappedField{.names = {"element"}, .field_id = 20}); + auto element_mapping = MappedFields::Make(std::move(element_fields)); + + fields.emplace_back(MappedField{ + .names = {"items"}, .field_id = 2, .nested_mapping = std::move(element_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for map types + std::unique_ptr<NameMapping> CreateMapNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for map key-value + std::vector<MappedField> map_fields; + map_fields.emplace_back(MappedField{.names = {"key"}, .field_id = 30}); + map_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 31}); + auto map_mapping = MappedFields::Make(std::move(map_fields)); + + fields.emplace_back(MappedField{.names = {"properties"}, + .field_id = 2, + .nested_mapping = std::move(map_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for union types + std::unique_ptr<NameMapping> CreateUnionNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"data"}, .field_id = 2}); + return NameMapping::Make(std::move(fields)); + } +}; + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { + // Create a simple Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateSimpleNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 3); + EXPECT_EQ(node->leaves(), 3); + + // Check that field IDs are properly applied + ASSERT_EQ(node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { + // Create a nested Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "address", "type": { + "type": "record", + "name": "address", + "fields": [ + {"name": "street", "type": "string"}, + {"name": "city", "type": "string"}, + {"name": "zip", "type": "string"} + ] + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateNestedNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 2); + EXPECT_EQ(node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + + // Check nested record + const auto& address_node = node->leafAt(1); + EXPECT_EQ(address_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(address_node->names(), 3); + EXPECT_EQ(address_node->leaves(), 3); + + // Check that field IDs are properly applied to nested fields + ASSERT_EQ(address_node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/0, /*field_id=*/10)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/1, /*field_id=*/11)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/2, /*field_id=*/12)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { + // Create an Avro array schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "items", "type": { + "type": "array", + "items": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateArrayNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check array field structure and element field ID + const auto& array_node = new_node->leafAt(1); + EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); + EXPECT_EQ(array_node->leaves(), 1); + + // Check that array element has field ID applied + const auto& element_node = array_node->leafAt(0); + EXPECT_EQ(element_node->type(), ::avro::AVRO_STRING); Review Comment: ```suggestion EXPECT_EQ(element_node->type(), ::avro::AVRO_STRING); ASSERT_EQ(array_node->customAttributes(), 1); ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(array_node, /*index=*/0, /*field_id=*/20)); ``` ########## src/iceberg/avro/avro_schema_util.cc: ########## @@ -783,4 +785,237 @@ Result<SchemaProjection> Project(const Schema& expected_schema, return SchemaProjection{std::move(field_projection.children)}; } +namespace { + +Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + auto new_record_node = std::make_shared<::avro::NodeRecord>(); + new_record_node->setName(original_node->name()); + + if (original_node->leaves() > original_node->names()) { + return InvalidSchema("Node has {} leaves but only {} names", original_node->leaves(), + original_node->names()); + } + + for (size_t i = 0; i < original_node->leaves(); ++i) { + const std::string& field_name = original_node->nameAt(i); + ::avro::NodePtr field_node = original_node->leafAt(i); + + // TODO(liuxiaoyu): Add support for case sensitivity in name matching. + // Try to find nested field by name + const MappedField* nested_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(field_name) != f.names.end()) { + nested_field = &f; + break; + } + } + } + + if (nested_field) { + // Check if field_id is present + if (!nested_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for field '{}' in nested mapping", + field_name); + } + + // Preserve existing custom attributes for this field + ::avro::CustomAttributes attributes; + if (i < original_node->customAttributes()) { + // Copy all existing attributes from the original node + const auto& original_attrs = original_node->customAttributesAt(i); + const auto& existing_attrs = original_attrs.attributes(); + for (const auto& attr_pair : existing_attrs) { + attributes.addAttribute(attr_pair.first, attr_pair.second, false); Review Comment: Please add this comment. ########## src/iceberg/avro/avro_reader.cc: ########## @@ -96,16 +97,29 @@ class AvroReader::Impl { // Create a base reader without setting reader schema to enable projection. auto base_reader = std::make_unique<::avro::DataFileReaderBase>(std::move(input_stream)); - const ::avro::ValidSchema& file_schema = base_reader->dataSchema(); + ::avro::ValidSchema file_schema = base_reader->dataSchema(); // Validate field ids in the file schema. HasIdVisitor has_id_visitor; ICEBERG_RETURN_UNEXPECTED(has_id_visitor.Visit(file_schema)); + if (has_id_visitor.HasNoIds()) { - // TODO(gangwu): support applying field-ids based on name mapping - return NotImplemented("Avro file schema has no field IDs"); - } - if (!has_id_visitor.AllHaveIds()) { + // Apply field IDs based on name mapping if available + if (options.name_mapping) { + ICEBERG_ASSIGN_OR_RAISE( + auto new_root_node, + CreateAvroNodeWithFieldIds(file_schema.root(), *options.name_mapping)); + + // Create a new schema with the updated root node + auto new_schema = ::avro::ValidSchema(new_root_node); + + // Update the file schema to use the new schema with field IDs + file_schema = new_schema; Review Comment: ```suggestion file_schema = ::avro::ValidSchema(new_root_node); ``` We don't need a temporary variable. ########## test/avro_schema_test.cc: ########## @@ -1057,4 +1059,366 @@ TEST(AvroSchemaProjectionTest, ProjectDecimalIncompatible) { ASSERT_THAT(projection_result, HasErrorMessage("Cannot read")); } +// NameMapping tests for Avro schema context +class NameMappingAvroSchemaTest : public ::testing::Test { + protected: + // Helper function to create a simple name mapping + std::unique_ptr<NameMapping> CreateSimpleNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + fields.emplace_back(MappedField{.names = {"age"}, .field_id = 3}); + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a nested name mapping + std::unique_ptr<NameMapping> CreateNestedNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for address + std::vector<MappedField> address_fields; + address_fields.emplace_back(MappedField{.names = {"street"}, .field_id = 10}); + address_fields.emplace_back(MappedField{.names = {"city"}, .field_id = 11}); + address_fields.emplace_back(MappedField{.names = {"zip"}, .field_id = 12}); + auto address_mapping = MappedFields::Make(std::move(address_fields)); + + fields.emplace_back(MappedField{.names = {"address"}, + .field_id = 2, + .nested_mapping = std::move(address_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for array types + std::unique_ptr<NameMapping> CreateArrayNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for array element + std::vector<MappedField> element_fields; + element_fields.emplace_back(MappedField{.names = {"element"}, .field_id = 20}); + auto element_mapping = MappedFields::Make(std::move(element_fields)); + + fields.emplace_back(MappedField{ + .names = {"items"}, .field_id = 2, .nested_mapping = std::move(element_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for map types + std::unique_ptr<NameMapping> CreateMapNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for map key-value + std::vector<MappedField> map_fields; + map_fields.emplace_back(MappedField{.names = {"key"}, .field_id = 30}); + map_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 31}); + auto map_mapping = MappedFields::Make(std::move(map_fields)); + + fields.emplace_back(MappedField{.names = {"properties"}, + .field_id = 2, + .nested_mapping = std::move(map_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for union types + std::unique_ptr<NameMapping> CreateUnionNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"data"}, .field_id = 2}); + return NameMapping::Make(std::move(fields)); + } +}; + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { + // Create a simple Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateSimpleNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 3); + EXPECT_EQ(node->leaves(), 3); + + // Check that field IDs are properly applied + ASSERT_EQ(node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { + // Create a nested Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "address", "type": { + "type": "record", + "name": "address", + "fields": [ + {"name": "street", "type": "string"}, + {"name": "city", "type": "string"}, + {"name": "zip", "type": "string"} + ] + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateNestedNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 2); + EXPECT_EQ(node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + + // Check nested record + const auto& address_node = node->leafAt(1); + EXPECT_EQ(address_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(address_node->names(), 3); + EXPECT_EQ(address_node->leaves(), 3); + + // Check that field IDs are properly applied to nested fields + ASSERT_EQ(address_node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/0, /*field_id=*/10)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/1, /*field_id=*/11)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/2, /*field_id=*/12)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { + // Create an Avro array schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "items", "type": { + "type": "array", + "items": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateArrayNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check array field structure and element field ID + const auto& array_node = new_node->leafAt(1); + EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); + EXPECT_EQ(array_node->leaves(), 1); + + // Check that array element has field ID applied + const auto& element_node = array_node->leafAt(0); + EXPECT_EQ(element_node->type(), ::avro::AVRO_STRING); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { + // Create an Avro map schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "properties", "type": { + "type": "map", + "values": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateMapNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check map field structure and key-value field IDs + const auto& map_node = new_node->leafAt(1); + EXPECT_EQ(map_node->type(), ::avro::AVRO_MAP); + ASSERT_GE(map_node->leaves(), 2); + EXPECT_EQ(map_node->leafAt(0)->type(), ::avro::AVRO_STRING); + EXPECT_EQ(map_node->leafAt(1)->type(), ::avro::AVRO_STRING); Review Comment: ```suggestion EXPECT_EQ(map_node->leafAt(1)->type(), ::avro::AVRO_STRING); ASSERT_EQ(map_node->customAttributes(), 2); ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/0, /*field_id=*/30)); ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/1, /*field_id=*/31)); ``` ########## src/iceberg/avro/avro_schema_util.cc: ########## @@ -783,4 +785,232 @@ Result<SchemaProjection> Project(const Schema& expected_schema, return SchemaProjection{std::move(field_projection.children)}; } +namespace { + +Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + auto new_record_node = std::make_shared<::avro::NodeRecord>(); + new_record_node->setName(original_node->name()); + + for (size_t i = 0; i < original_node->leaves(); ++i) { + if (i >= original_node->names()) { + return InvalidSchema("Index {} is out of bounds for names (size: {})", i, + original_node->names()); + } + const std::string& field_name = original_node->nameAt(i); + if (i >= original_node->leaves()) { + return InvalidSchema("Index {} is out of bounds for leaves (size: {})", i, + original_node->leaves()); + } Review Comment: ```suggestion ``` This is checked already in the for loop. ########## test/avro_schema_test.cc: ########## @@ -1057,4 +1059,366 @@ TEST(AvroSchemaProjectionTest, ProjectDecimalIncompatible) { ASSERT_THAT(projection_result, HasErrorMessage("Cannot read")); } +// NameMapping tests for Avro schema context +class NameMappingAvroSchemaTest : public ::testing::Test { + protected: + // Helper function to create a simple name mapping + std::unique_ptr<NameMapping> CreateSimpleNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + fields.emplace_back(MappedField{.names = {"age"}, .field_id = 3}); + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a nested name mapping + std::unique_ptr<NameMapping> CreateNestedNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for address + std::vector<MappedField> address_fields; + address_fields.emplace_back(MappedField{.names = {"street"}, .field_id = 10}); + address_fields.emplace_back(MappedField{.names = {"city"}, .field_id = 11}); + address_fields.emplace_back(MappedField{.names = {"zip"}, .field_id = 12}); + auto address_mapping = MappedFields::Make(std::move(address_fields)); + + fields.emplace_back(MappedField{.names = {"address"}, + .field_id = 2, + .nested_mapping = std::move(address_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for array types + std::unique_ptr<NameMapping> CreateArrayNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for array element + std::vector<MappedField> element_fields; + element_fields.emplace_back(MappedField{.names = {"element"}, .field_id = 20}); + auto element_mapping = MappedFields::Make(std::move(element_fields)); + + fields.emplace_back(MappedField{ + .names = {"items"}, .field_id = 2, .nested_mapping = std::move(element_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for map types + std::unique_ptr<NameMapping> CreateMapNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for map key-value + std::vector<MappedField> map_fields; + map_fields.emplace_back(MappedField{.names = {"key"}, .field_id = 30}); + map_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 31}); + auto map_mapping = MappedFields::Make(std::move(map_fields)); + + fields.emplace_back(MappedField{.names = {"properties"}, + .field_id = 2, + .nested_mapping = std::move(map_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for union types + std::unique_ptr<NameMapping> CreateUnionNameMapping() { + std::vector<MappedField> fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"data"}, .field_id = 2}); + return NameMapping::Make(std::move(fields)); + } +}; + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { + // Create a simple Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateSimpleNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 3); + EXPECT_EQ(node->leaves(), 3); + + // Check that field IDs are properly applied + ASSERT_EQ(node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { + // Create a nested Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "address", "type": { + "type": "record", + "name": "address", + "fields": [ + {"name": "street", "type": "string"}, + {"name": "city", "type": "string"}, + {"name": "zip", "type": "string"} + ] + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateNestedNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 2); + EXPECT_EQ(node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + + // Check nested record + const auto& address_node = node->leafAt(1); + EXPECT_EQ(address_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(address_node->names(), 3); + EXPECT_EQ(address_node->leaves(), 3); + + // Check that field IDs are properly applied to nested fields + ASSERT_EQ(address_node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/0, /*field_id=*/10)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/1, /*field_id=*/11)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/2, /*field_id=*/12)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { + // Create an Avro array schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "items", "type": { + "type": "array", + "items": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateArrayNameMapping(); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check array field structure and element field ID + const auto& array_node = new_node->leafAt(1); + EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); + EXPECT_EQ(array_node->leaves(), 1); + + // Check that array element has field ID applied + const auto& element_node = array_node->leafAt(0); + EXPECT_EQ(element_node->type(), ::avro::AVRO_STRING); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { + // Create an Avro map schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "properties", "type": { + "type": "map", Review Comment: Could you also add a test case for map type whose key is not a string type? We need to cover the case where map is an `array<struct<key,value>>` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org