PR 72812 points out that Go can generate non-ASCII characters in assembly code. This is a consequence of the fact that Go permits identifiers to contain non-ASCII Unicode code points. The GNU assembler doesn't seem to mind, but the Solaris assembler does.
This patch changes the GCC Go interface to encode non-ASCII characters in identifier names by setting DECL_ASSEMBLER_NAME to an encoded value. This fixes the problem with the Solaris assembler without disturbing the debug info. Bootstrapped and ran Go testsuite on x86_64-pc-linux-gnu. Ran reflect package tests on Solaris and confirmed that they now pass. Committed to mainline. Ian 2016-08-05 Ian Lance Taylor <i...@google.com> PR go/72812 * go-gcc.cc (char_needs_encoding): New static function. (needs_encoding, fetch_utf8_char): New static functions. (encode_id): New static function. (Gcc_backend::global_variable): Set asm name if the name is not simple ASCII. (Gcc_backend::implicit_variable): Likewise. (Gcc_backend::implicit_variable_reference): Likewise. (Gcc_backend::immutable_struct): Likewise. (Gcc_backend::immutable_struct_reference): Likewise. (Gcc_backend::function): Likewise.
Index: gcc/go/go-gcc.cc =================================================================== --- gcc/go/go-gcc.cc (revision 238653) +++ gcc/go/go-gcc.cc (working copy) @@ -541,7 +541,7 @@ private: std::map<std::string, Bfunction*> builtin_functions_; }; -// A helper function. +// A helper function to create a GCC identifier from a C++ string. static inline tree get_identifier_from_string(const std::string& str) @@ -549,6 +549,102 @@ get_identifier_from_string(const std::st return get_identifier_with_length(str.data(), str.length()); } +// Return whether the character c is OK to use in the assembler. + +static bool +char_needs_encoding(char c) +{ + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case '_': case '.': case '$': case '/': + return false; + default: + return true; + } +} + +// Return whether the identifier needs to be translated because it +// contains non-ASCII characters. + +static bool +needs_encoding(const std::string& str) +{ + for (std::string::const_iterator p = str.begin(); + p != str.end(); + ++p) + if (char_needs_encoding(*p)) + return true; + return false; +} + +// Pull the next UTF-8 character out of P and store it in *PC. Return +// the number of bytes read. + +static size_t +fetch_utf8_char(const char* p, unsigned int* pc) +{ + unsigned char c = *p; + if ((c & 0x80) == 0) + { + *pc = c; + return 1; + } + size_t len = 0; + while ((c & 0x80) != 0) + { + ++len; + c <<= 1; + } + unsigned int rc = *p & ((1 << (7 - len)) - 1); + for (size_t i = 1; i < len; i++) + { + unsigned int u = p[i]; + rc <<= 6; + rc |= u & 0x3f; + } + *pc = rc; + return len; +} + +// Encode an identifier using ASCII characters. + +static std::string +encode_id(const std::string id) +{ + std::string ret; + const char* p = id.c_str(); + const char* pend = p + id.length(); + while (p < pend) + { + unsigned int c; + size_t len = fetch_utf8_char(p, &c); + if (len == 1 && !char_needs_encoding(c)) + ret += c; + else + { + ret += "$U"; + char buf[30]; + snprintf(buf, sizeof buf, "%x", c); + ret += buf; + ret += "$"; + } + p += len; + } + return ret; +} + // Define the built-in functions that are exposed to GCCGo. Gcc_backend::Gcc_backend() @@ -2454,8 +2550,14 @@ Gcc_backend::global_variable(const std:: std::string asm_name(pkgpath); asm_name.push_back('.'); asm_name.append(name); + if (needs_encoding(asm_name)) + asm_name = encode_id(asm_name); SET_DECL_ASSEMBLER_NAME(decl, get_identifier_from_string(asm_name)); } + else if (needs_encoding(var_name)) + SET_DECL_ASSEMBLER_NAME(decl, + get_identifier_from_string(encode_id(var_name))); + TREE_USED(decl) = 1; if (in_unique_section) @@ -2690,6 +2792,8 @@ Gcc_backend::implicit_variable(const std SET_DECL_ALIGN(decl, alignment * BITS_PER_UNIT); DECL_USER_ALIGN(decl) = 1; } + if (needs_encoding(name)) + SET_DECL_ASSEMBLER_NAME(decl, get_identifier_from_string(encode_id(name))); go_preserve_from_gc(decl); return new Bvariable(decl); @@ -2742,6 +2846,8 @@ Gcc_backend::implicit_variable_reference TREE_PUBLIC(decl) = 1; TREE_STATIC(decl) = 1; DECL_ARTIFICIAL(decl) = 1; + if (needs_encoding(name)) + SET_DECL_ASSEMBLER_NAME(decl, get_identifier_from_string(encode_id(name))); go_preserve_from_gc(decl); return new Bvariable(decl); } @@ -2766,6 +2872,8 @@ Gcc_backend::immutable_struct(const std: DECL_ARTIFICIAL(decl) = 1; if (!is_hidden) TREE_PUBLIC(decl) = 1; + if (needs_encoding(name)) + SET_DECL_ASSEMBLER_NAME(decl, get_identifier_from_string(encode_id(name))); // When the initializer for one immutable_struct refers to another, // it needs to know the visibility of the referenced struct so that @@ -2840,6 +2948,8 @@ Gcc_backend::immutable_struct_reference( DECL_ARTIFICIAL(decl) = 1; TREE_PUBLIC(decl) = 1; DECL_EXTERNAL(decl) = 1; + if (needs_encoding(name)) + SET_DECL_ASSEMBLER_NAME(decl, get_identifier_from_string(encode_id(name))); go_preserve_from_gc(decl); return new Bvariable(decl); } @@ -2931,6 +3041,8 @@ Gcc_backend::function(Btype* fntype, con tree decl = build_decl(location.gcc_location(), FUNCTION_DECL, id, functype); if (!asm_name.empty()) SET_DECL_ASSEMBLER_NAME(decl, get_identifier_from_string(asm_name)); + else if (needs_encoding(name)) + SET_DECL_ASSEMBLER_NAME(decl, get_identifier_from_string(encode_id(name))); if (is_visible) TREE_PUBLIC(decl) = 1; if (is_declaration)