wgtmac commented on code in PR #190:
URL: https://github.com/apache/iceberg-cpp/pull/190#discussion_r2320821335
##########
src/iceberg/avro/avro_schema_util.cc:
##########
@@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t
field_id) {
} // namespace
+bool validAvroName(const std::string& name) {
+ if (name.empty()) {
+ throw std::runtime_error("Empty name");
Review Comment:
We don't use exception. Please return false here.
##########
test/avro_schema_test.cc:
##########
@@ -32,6 +32,9 @@
namespace iceberg::avro {
+// Forward declaration of functions to test
+bool validAvroName(const std::string& name);
Review Comment:
Why hack this? You can declare it in avro_schema_util_internal.h
##########
test/avro_schema_test.cc:
##########
@@ -1436,5 +1576,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping);
ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema));
}
-
Review Comment:
Please revert this line.
##########
src/iceberg/avro/avro_schema_util_internal.h:
##########
@@ -156,4 +156,28 @@ bool HasMapLogicalType(const ::avro::NodePtr& node);
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr&
original_node,
const NameMapping& mapping);
+/// \brief Sanitize a field name to make it compatible with Avro field name
requirements.
+///
+/// Converts names that are not valid Avro names to valid Avro names.
+/// Conversion rules:
+/// 1. If the first character is not a letter or underscore, it is specially
handled:
+/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
+/// - Other characters: Converted to '_x' followed by the uppercase
hexadecimal
+/// representation
+/// of the character (e.g., '$' -> '_x24')
Review Comment:
```suggestion
/// representation of the character (e.g., '$' -> '_x24')
```
##########
src/iceberg/avro/avro_schema_util.cc:
##########
@@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t
field_id) {
} // namespace
+bool validAvroName(const std::string& name) {
+ if (name.empty()) {
+ throw std::runtime_error("Empty name");
+ }
+
+ char first = name[0];
+ if (!(std::isalpha(first) || first == '_')) {
+ return false;
+ }
+
+ for (size_t i = 1; i < name.length(); i++) {
+ char character = name[i];
+ if (!(std::isalnum(character) || character == '_')) {
+ return false;
+ }
+ }
+ return true;
+}
+
+std::string SanitizeChar(char c) {
Review Comment:
Please move it into anonymous namespace.
##########
src/iceberg/avro/avro_schema_util.cc:
##########
@@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t
field_id) {
} // namespace
+bool validAvroName(const std::string& name) {
+ if (name.empty()) {
+ throw std::runtime_error("Empty name");
+ }
+
+ char first = name[0];
+ if (!(std::isalpha(first) || first == '_')) {
+ return false;
+ }
+
+ for (size_t i = 1; i < name.length(); i++) {
+ char character = name[i];
+ if (!(std::isalnum(character) || character == '_')) {
+ return false;
+ }
+ }
+ return true;
+}
+
+std::string SanitizeChar(char c) {
+ if (std::isdigit(c)) {
+ return std::string("_") + c;
+ }
+ std::stringstream ss;
+ ss << "_x" << std::uppercase << std::hex << static_cast<int>(c);
+ return ss.str();
+}
+
+std::string SanitizeFieldName(std::string_view field_name) {
+ std::string result;
Review Comment:
Use std::ostringstream ?
##########
src/iceberg/avro/avro_schema_util.cc:
##########
@@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t
field_id) {
} // namespace
+bool validAvroName(const std::string& name) {
+ if (name.empty()) {
+ throw std::runtime_error("Empty name");
+ }
+
+ char first = name[0];
+ if (!(std::isalpha(first) || first == '_')) {
+ return false;
+ }
+
+ for (size_t i = 1; i < name.length(); i++) {
+ char character = name[i];
+ if (!(std::isalnum(character) || character == '_')) {
+ return false;
+ }
+ }
+ return true;
+}
+
+std::string SanitizeChar(char c) {
+ if (std::isdigit(c)) {
+ return std::string("_") + c;
+ }
+ std::stringstream ss;
+ ss << "_x" << std::uppercase << std::hex << static_cast<int>(c);
+ return ss.str();
+}
+
+std::string SanitizeFieldName(std::string_view field_name) {
+ std::string result;
+ result.reserve(field_name.size());
+
+ if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
Review Comment:
What if `field_name` is empty?
##########
src/iceberg/avro/avro_schema_util.cc:
##########
@@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t
field_id) {
} // namespace
+bool validAvroName(const std::string& name) {
Review Comment:
```suggestion
bool ValidAvroName(std::string_view name) {
```
Please move it into anonymous namespace.
##########
src/iceberg/avro/avro_schema_util.cc:
##########
@@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t
field_id) {
} // namespace
+bool validAvroName(const std::string& name) {
+ if (name.empty()) {
+ throw std::runtime_error("Empty name");
+ }
+
+ char first = name[0];
+ if (!(std::isalpha(first) || first == '_')) {
+ return false;
+ }
+
+ for (size_t i = 1; i < name.length(); i++) {
+ char character = name[i];
+ if (!(std::isalnum(character) || character == '_')) {
+ return false;
+ }
+ }
+ return true;
+}
+
+std::string SanitizeChar(char c) {
Review Comment:
```suggestion
void SanitizeChar(char c, std::ostringstream& os) {
```
It is unnecessary to create std::stringstream and std::string objects for
every char.
##########
src/iceberg/avro/avro_schema_util.cc:
##########
@@ -181,8 +231,10 @@ Status ToAvroNodeVisitor::Visit(const StructType& type,
::avro::NodePtr* node) {
::avro::NodePtr field_node;
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));
- // TODO(gangwu): sanitize field name
- (*node)->addName(std::string(sub_field.name()));
+ bool isValidFieldName = validAvroName(std::string(sub_field.name()));
Review Comment:
If the name has been sanitized, you need to follow this:
https://github.com/apache/iceberg/blob/be03c998d96d0d1fae13aa8c53d6c7c87e2d60ba/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java#L125-L127
##########
test/avro_schema_test.cc:
##########
@@ -181,6 +258,69 @@ TEST(ToAvroNodeVisitorTest, StructType) {
EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
}
+TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
+ // Test struct with field names that require sanitization
+ StructType struct_type{
+ {SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
+ /*optional=*/false},
+ SchemaField{/*field_id=*/2, "email.address", iceberg::string(),
+ /*optional=*/true},
+ SchemaField{/*field_id=*/3, "123field", iceberg::int32(),
+ /*optional=*/false},
+ SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(),
+ /*optional=*/true}}};
+
+ ::avro::NodePtr node;
+ EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
+ EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);
+
+ // Check that field names are sanitized
+ ASSERT_EQ(node->names(), 4);
+ EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname"
+ EXPECT_EQ(node->nameAt(1),
+ "email_x2Eaddress"); // "email.address" ->
"email_x2Eaddress"
+ EXPECT_EQ(node->nameAt(2), "_123field"); // "123field" -> "_123field"
+ EXPECT_EQ(
+ node->nameAt(3),
+ "field_x20with_x20spaces"); // "field with spaces" ->
"field_x20with_x20spaces"
+
+ // Check that field IDs are correctly applied
+ // Each field has 1 custom attribute: field-id
+ ASSERT_EQ(node->customAttributes(), 4);
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3));
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4));
+}
+
+TEST(ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) {
Review Comment:
Can we merge this with the above case? I don't think it worths a separate
case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]