Fokko commented on code in PR #83: URL: https://github.com/apache/iceberg-python/pull/83#discussion_r1363628652
########## pyiceberg/schema.py: ########## @@ -1273,6 +1273,102 @@ def primitive(self, primitive: PrimitiveType) -> PrimitiveType: return primitive +# Implementation copied from Apache Iceberg repo. +def make_compatible_name(name: str) -> str: + if not _valid_avro_name(name): + return _sanitize_name(name) + return name + + +def _valid_avro_name(name: str) -> bool: + length = len(name) + assert length > 0, "Empty name" Review Comment: We avoid assertions in the code itself. If we want to check this, please raise a `ValueError` (or omit the check, do we allow empty names at all?) ########## pyiceberg/schema.py: ########## @@ -1273,6 +1273,102 @@ def primitive(self, primitive: PrimitiveType) -> PrimitiveType: return primitive +# Implementation copied from Apache Iceberg repo. +def make_compatible_name(name: str) -> str: + if not _valid_avro_name(name): + return _sanitize_name(name) + return name + + +def _valid_avro_name(name: str) -> bool: + length = len(name) + assert length > 0, "Empty name" + first = name[0] + if not (first.isalpha() or first == '_'): + return False + + for i in range(1, length): + character = name[i] + if not (character.isalnum() or character == '_'): + return False Review Comment: We could achieve the same with a slice: ```suggestion for character in name[1:]: if not (character.isalnum() or character == '_'): return False ``` Example: ``` python3 Python 3.11.6 (main, Oct 2 2023, 13:45:54) [Clang 15.0.0 (clang-1500.0.40.1)] on darwin Type "help", "copyright", "credits" or "license" for more information. >>> "abc"[1:] 'bc' ``` ########## pyiceberg/schema.py: ########## @@ -1273,6 +1273,102 @@ def primitive(self, primitive: PrimitiveType) -> PrimitiveType: return primitive +# Implementation copied from Apache Iceberg repo. +def make_compatible_name(name: str) -> str: + if not _valid_avro_name(name): + return _sanitize_name(name) + return name + + +def _valid_avro_name(name: str) -> bool: + length = len(name) + assert length > 0, "Empty name" + first = name[0] + if not (first.isalpha() or first == '_'): + return False + + for i in range(1, length): + character = name[i] + if not (character.isalnum() or character == '_'): + return False + return True + + +def _sanitize_name(name: str) -> str: + length = len(name) + sb = [] + first = name[0] + if not (first.isalpha() or first == '_'): + sb.append(_sanitize_char(first)) + else: + sb.append(first) + + for i in range(1, length): + character = name[i] + if not (character.isalnum() or character == '_'): + sb.append(_sanitize_char(character)) + else: + sb.append(character) + return ''.join(sb) + + +def _sanitize_char(character: str) -> str: + if character.isdigit(): + return "_" + character + return "_x" + hex(ord(character))[2:].upper() Review Comment: ```suggestion return "_" + character if character.isdigit() else "_x" + hex(ord(character))[2:].upper() ``` ########## pyiceberg/schema.py: ########## @@ -1273,6 +1273,102 @@ def primitive(self, primitive: PrimitiveType) -> PrimitiveType: return primitive +# Implementation copied from Apache Iceberg repo. +def make_compatible_name(name: str) -> str: + if not _valid_avro_name(name): + return _sanitize_name(name) + return name + + +def _valid_avro_name(name: str) -> bool: + length = len(name) + assert length > 0, "Empty name" + first = name[0] + if not (first.isalpha() or first == '_'): + return False + + for i in range(1, length): + character = name[i] + if not (character.isalnum() or character == '_'): + return False + return True + + +def _sanitize_name(name: str) -> str: + length = len(name) + sb = [] + first = name[0] + if not (first.isalpha() or first == '_'): + sb.append(_sanitize_char(first)) + else: + sb.append(first) + + for i in range(1, length): + character = name[i] + if not (character.isalnum() or character == '_'): + sb.append(_sanitize_char(character)) + else: + sb.append(character) Review Comment: We can also use the slice here: ```suggestion for character in name[1:]: if not (character.isalnum() or character == '_'): sb.append(_sanitize_char(character)) else: sb.append(character) ``` Or: ```suggestion for character in name[1:]: sb.append(_sanitize_char(character) if not (character.isalnum() or character == '_') else character) ``` Or: ```suggestion sb = [first] + [ _sanitize_char(character) if not (character.isalnum() or character == '_') else character ] ``` ########## pyiceberg/schema.py: ########## @@ -1273,6 +1273,102 @@ def primitive(self, primitive: PrimitiveType) -> PrimitiveType: return primitive +# Implementation copied from Apache Iceberg repo. +def make_compatible_name(name: str) -> str: + if not _valid_avro_name(name): + return _sanitize_name(name) + return name + + +def _valid_avro_name(name: str) -> bool: + length = len(name) + assert length > 0, "Empty name" + first = name[0] + if not (first.isalpha() or first == '_'): + return False + + for i in range(1, length): + character = name[i] + if not (character.isalnum() or character == '_'): + return False + return True + + +def _sanitize_name(name: str) -> str: + length = len(name) + sb = [] + first = name[0] + if not (first.isalpha() or first == '_'): + sb.append(_sanitize_char(first)) + else: + sb.append(first) + + for i in range(1, length): + character = name[i] + if not (character.isalnum() or character == '_'): + sb.append(_sanitize_char(character)) + else: + sb.append(character) + return ''.join(sb) + + +def _sanitize_char(character: str) -> str: + if character.isdigit(): + return "_" + character + return "_x" + hex(ord(character))[2:].upper() + + +def sanitize_column_names(schema: Schema) -> Schema: + """Sanitize column names to make them compatible with Avro. Review Comment: Maybe we can add here what Avro doesn't accept. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org