I apologize to everyone who already received this earlier;
I forgot to send it to the list.
This patch adds UTF-8 character literals to C++:
auto c = u8'c';
This is mostly a straightforward preprocessor exercise.
I only allow these character literals with -std=c++1z and -std=gnu++1z.
I figure we can easily expand that, with or without a pedwarn.
In c-ada-spec.c (print_ada_macros()) I just write these as a char
constant rather than spelling the token.
We could do the latter instead; you'd then see the "u8" prefix, I think.
I couldn't find where this is exercised in the Ada test suite.
That's all that seems controversial to me.
Built and tested clean on x86_64-linux.
Ed
libcpp:
2015-06-28 Edward Smith-Rowland <3dw...@verizon.net>
Implement N4197 - Adding u8 character literals
* include/cpplib.h (UTF8CHAR, UTF8CHAR_USERDEF): New cpp tokens;
(struct cpp_options): Add utf8_char_literals.
* init.c (struct lang_flags): Add utf8_char_literals;
(struct lang_flags lang_defaults): Add column for utf8_char_literals.
* macro.c (stringify_arg()): Treat CPP_UTF8CHAR token;
* expr.c (cpp_userdef_char_remove_type(), cpp_userdef_char_add_type()):
Treat CPP_UTF8CHAR_USERDEF, CPP_UTF8CHAR tokens;
(cpp_userdef_char_p()): Treat CPP_UTF8CHAR_USERDEF token;
(eval_token(), _cpp_parse_expr()): Treat CPP_UTF8CHAR token.
* lex.c (lex_string(), _cpp_lex_direct()): Include CPP_UTF8CHAR tokens.
* charset.c (converter_for_type(), cpp_interpret_charconst()):
Treat CPP_UTF8CHAR token.
gcc/c-family:
2015-06-28 Edward Smith-Rowland <3dw...@verizon.net>
Implement N4197 - Adding u8 character literals
* c-family/c-ada-spec.c (print_ada_macros()): Treat CPP_UTF8CHAR
like CPP_CHAR.
* c-family/c-common.c (c_parse_error()): Print CPP_UTF8CHAR
and CPP_UTF8CHAR_USERDEF tokens.
* c-family/c-lex.c (c_lex_with_flags()): Treat CPP_UTF8CHAR_USERDEF
and CPP_UTF8CHAR tokens; (lex_charconst()): Treat CPP_UTF8CHAR token.
gcc/cp:
2015-06-28 Edward Smith-Rowland <3dw...@verizon.net>
Implement N4197 - Adding u8 character literals
* parser.c (cp_parser_primary_expression()): Treat CPP_UTF8CHAR
and CPP_UTF8CHAR_USERDEF tokens;
(cp_parser_parenthesized_expression_list()): Treat CPP_UTF8CHAR token.
gcc/testsuite:
2015-06-28 Edward Smith-Rowland <3dw...@verizon.net>
Implement N4197 - Adding u8 character literals
* g++.dg/cpp1z/utf8.C: New.
* g++.dg/cpp1z/utf8-neg.C: New.
* g++.dg/cpp1z/udlit-utf8char.C: New.
Index: libcpp/include/cpplib.h
===================================================================
--- libcpp/include/cpplib.h (revision 225099)
+++ libcpp/include/cpplib.h (working copy)
@@ -119,6 +119,7 @@
TK(WCHAR, LITERAL) /* L'char' */ \
TK(CHAR16, LITERAL) /* u'char' */ \
TK(CHAR32, LITERAL) /* U'char' */ \
+ TK(UTF8CHAR, LITERAL) /* u8'char' */ \
TK(OTHER, LITERAL) /* stray punctuation */ \
\
TK(STRING, LITERAL) /* "string" */ \
@@ -133,6 +134,7 @@
TK(WCHAR_USERDEF, LITERAL) /* L'char'_suffix - C++-0x */ \
TK(CHAR16_USERDEF, LITERAL) /* u'char'_suffix - C++-0x */ \
TK(CHAR32_USERDEF, LITERAL) /* U'char'_suffix - C++-0x */ \
+ TK(UTF8CHAR_USERDEF, LITERAL) /* u8'char'_suffix - C++-0x */ \
TK(STRING_USERDEF, LITERAL) /* "string"_suffix - C++-0x */ \
TK(WSTRING_USERDEF, LITERAL) /* L"string"_suffix - C++-0x */ \
TK(STRING16_USERDEF, LITERAL) /* u"string"_suffix - C++-0x */ \
@@ -339,6 +341,9 @@
/* Nonzero means process u/U prefix literals (UTF-16/32). */
unsigned char uliterals;
+ /* Nonzero means process u8 prefixed character literals (UTF-8). */
+ unsigned char utf8_char_literals;
+
/* Nonzero means process r/R raw strings. If this is set, uliterals
must be set as well. */
unsigned char rliterals;
Index: libcpp/init.c
===================================================================
--- libcpp/init.c (revision 225099)
+++ libcpp/init.c (working copy)
@@ -90,26 +90,27 @@
char binary_constants;
char digit_separators;
char trigraphs;
+ char utf8_char_literals;
};
static const struct lang_flags lang_defaults[] =
-{ /* c99 c++ xnum xid c11 std digr ulit rlit udlit bincst digsep trig */
- /* GNUC89 */ { 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
- /* GNUC99 */ { 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0 },
- /* GNUC11 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0 },
- /* STDC89 */ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 },
- /* STDC94 */ { 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1 },
- /* STDC99 */ { 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1 },
- /* STDC11 */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1 },
- /* GNUCXX */ { 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
- /* CXX98 */ { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1 },
- /* GNUCXX11 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0 },
- /* CXX11 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1 },
- /* GNUCXX14 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0 },
- /* CXX14 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* GNUCXX1Z */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0 },
- /* CXX1Z */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 },
- /* ASM */ { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+{ /* c99 c++ xnum xid c11 std digr ulit rlit udlit bincst digsep trig u8chlit */
+ /* GNUC89 */ { 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 },
+ /* GNUC99 */ { 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
+ /* GNUC11 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
+ /* STDC89 */ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0 },
+ /* STDC94 */ { 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0 },
+ /* STDC99 */ { 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0 },
+ /* STDC11 */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0 },
+ /* GNUCXX */ { 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 },
+ /* CXX98 */ { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0 },
+ /* GNUCXX11 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0 },
+ /* CXX11 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0 },
+ /* GNUCXX14 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0 },
+ /* CXX14 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 },
+ /* GNUCXX1Z */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1 },
+ /* CXX1Z */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1 },
+ /* ASM */ { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};
/* Sets internal flags correctly for a given language. */
@@ -133,6 +134,7 @@
CPP_OPTION (pfile, binary_constants) = l->binary_constants;
CPP_OPTION (pfile, digit_separators) = l->digit_separators;
CPP_OPTION (pfile, trigraphs) = l->trigraphs;
+ CPP_OPTION (pfile, utf8_char_literals) = l->utf8_char_literals;
}
/* Initialize library global state. */
Index: libcpp/macro.c
===================================================================
--- libcpp/macro.c (revision 225099)
+++ libcpp/macro.c (working copy)
@@ -531,7 +531,7 @@
|| token->type == CPP_WSTRING || token->type == CPP_WCHAR
|| token->type == CPP_STRING32 || token->type == CPP_CHAR32
|| token->type == CPP_STRING16 || token->type == CPP_CHAR16
- || token->type == CPP_UTF8STRING
+ || token->type == CPP_UTF8STRING || token->type == CPP_UTF8CHAR
|| cpp_userdef_string_p (token->type)
|| cpp_userdef_char_p (token->type));
Index: libcpp/expr.c
===================================================================
--- libcpp/expr.c (revision 225099)
+++ libcpp/expr.c (working copy)
@@ -307,6 +307,8 @@
return CPP_CHAR16;
else if (type == CPP_CHAR32_USERDEF)
return CPP_CHAR32;
+ else if (type == CPP_UTF8CHAR_USERDEF)
+ return CPP_UTF8CHAR;
else
return type;
}
@@ -325,6 +327,8 @@
return CPP_CHAR16_USERDEF;
else if (type == CPP_CHAR32)
return CPP_CHAR32_USERDEF;
+ else if (type == CPP_UTF8CHAR)
+ return CPP_UTF8CHAR_USERDEF;
else
return type;
}
@@ -350,7 +354,8 @@
if (type == CPP_CHAR_USERDEF
|| type == CPP_WCHAR_USERDEF
|| type == CPP_CHAR16_USERDEF
- || type == CPP_CHAR32_USERDEF)
+ || type == CPP_CHAR32_USERDEF
+ || type == CPP_UTF8CHAR_USERDEF)
return true;
else
return false;
@@ -1029,6 +1034,7 @@
case CPP_CHAR:
case CPP_CHAR16:
case CPP_CHAR32:
+ case CPP_UTF8CHAR:
{
cppchar_t cc = cpp_interpret_charconst (pfile, token,
&temp, &unsignedp);
@@ -1214,6 +1220,7 @@
case CPP_WCHAR:
case CPP_CHAR16:
case CPP_CHAR32:
+ case CPP_UTF8CHAR:
case CPP_NAME:
case CPP_HASH:
if (!want_value)
Index: libcpp/lex.c
===================================================================
--- libcpp/lex.c (revision 225099)
+++ libcpp/lex.c (working copy)
@@ -1840,7 +1840,8 @@
else if (terminator == '\'')
type = (*base == 'L' ? CPP_WCHAR :
*base == 'U' ? CPP_CHAR32 :
- *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
+ *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
+ : CPP_CHAR);
else
terminator = '>', type = CPP_HEADER_NAME;
@@ -2385,7 +2386,8 @@
&& CPP_OPTION (pfile, rliterals))
|| (*buffer->cur == '8'
&& c == 'u'
- && (buffer->cur[1] == '"'
+ && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
+ && CPP_OPTION (pfile, utf8_char_literals)))
|| (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
&& CPP_OPTION (pfile, rliterals)))))
{
Index: libcpp/charset.c
===================================================================
--- libcpp/charset.c (revision 225099)
+++ libcpp/charset.c (working copy)
@@ -1355,6 +1355,7 @@
{
default:
return pfile->narrow_cset_desc;
+ case CPP_UTF8CHAR:
case CPP_UTF8STRING:
return pfile->utf8_cset_desc;
case CPP_CHAR16:
@@ -1611,11 +1612,12 @@
unsigned int *pchars_seen, int *unsignedp)
{
cpp_string str = { 0, 0 };
- bool wide = (token->type != CPP_CHAR);
+ bool wide = (token->type != CPP_CHAR && token->type != CPP_UTF8CHAR);
+ int u8 = 2 * int(token->type == CPP_UTF8CHAR);
cppchar_t result;
- /* an empty constant will appear as L'', u'', U'' or '' */
- if (token->val.str.len == (size_t) (2 + wide))
+ /* An empty constant will appear as L'', u'', U'', u8'', or '' */
+ if (token->val.str.len == (size_t) (2 + wide + u8))
{
cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
return 0;
Index: gcc/c-family/c-ada-spec.c
===================================================================
--- gcc/c-family/c-ada-spec.c (revision 225099)
+++ gcc/c-family/c-ada-spec.c (working copy)
@@ -259,6 +259,7 @@
break;
case CPP_CHAR:
+ case CPP_UTF8CHAR:
is_char = 1;
{
unsigned chars_seen;
Index: gcc/c-family/c-common.c
===================================================================
--- gcc/c-family/c-common.c (revision 225099)
+++ gcc/c-family/c-common.c (working copy)
@@ -10199,7 +10199,8 @@
else if (token_type == CPP_CHAR
|| token_type == CPP_WCHAR
|| token_type == CPP_CHAR16
- || token_type == CPP_CHAR32)
+ || token_type == CPP_CHAR32
+ || token_type == CPP_UTF8CHAR)
{
unsigned int val = TREE_INT_CST_LOW (value);
const char *prefix;
@@ -10218,6 +10219,9 @@
case CPP_CHAR32:
prefix = "U";
break;
+ case CPP_UTF8CHAR:
+ prefix = "u8";
+ break;
}
if (val <= UCHAR_MAX && ISGRAPH (val))
@@ -10232,7 +10236,8 @@
else if (token_type == CPP_CHAR_USERDEF
|| token_type == CPP_WCHAR_USERDEF
|| token_type == CPP_CHAR16_USERDEF
- || token_type == CPP_CHAR32_USERDEF)
+ || token_type == CPP_CHAR32_USERDEF
+ || token_type == CPP_UTF8CHAR_USERDEF)
message = catenate_messages (gmsgid,
" before user-defined character literal");
else if (token_type == CPP_STRING_USERDEF
Index: gcc/c-family/c-lex.c
===================================================================
--- gcc/c-family/c-lex.c (revision 225099)
+++ gcc/c-family/c-lex.c (working copy)
@@ -536,6 +536,7 @@
case CPP_WCHAR_USERDEF:
case CPP_CHAR16_USERDEF:
case CPP_CHAR32_USERDEF:
+ case CPP_UTF8CHAR_USERDEF:
{
tree literal;
cpp_token temp_tok = *tok;
@@ -553,6 +554,7 @@
case CPP_WCHAR:
case CPP_CHAR16:
case CPP_CHAR32:
+ case CPP_UTF8CHAR:
*value = lex_charconst (tok);
break;
@@ -1250,6 +1252,8 @@
type = char32_type_node;
else if (token->type == CPP_CHAR16)
type = char16_type_node;
+ else if (token->type == CPP_UTF8CHAR)
+ type = char_type_node;
/* In C, a character constant has type 'int'.
In C++ 'char', but multi-char charconsts have type 'int'. */
else if (!c_dialect_cxx () || chars_seen > 1)
Index: gcc/cp/parser.c
===================================================================
--- gcc/cp/parser.c (revision 225099)
+++ gcc/cp/parser.c (working copy)
@@ -4284,6 +4284,7 @@
case CPP_CHAR16:
case CPP_CHAR32:
case CPP_WCHAR:
+ case CPP_UTF8CHAR:
case CPP_NUMBER:
case CPP_PREPARSED_EXPR:
if (TREE_CODE (token->u.value) == USERDEF_LITERAL)
@@ -4345,6 +4346,7 @@
case CPP_CHAR16_USERDEF:
case CPP_CHAR32_USERDEF:
case CPP_WCHAR_USERDEF:
+ case CPP_UTF8CHAR_USERDEF:
return cp_parser_userdef_char_literal (parser);
case CPP_STRING:
@@ -6887,6 +6889,7 @@
case CPP_WCHAR:
case CPP_CHAR16:
case CPP_CHAR32:
+ case CPP_UTF8CHAR:
/* If a parameter is literal zero alone, remember it
for -Wmemset-transposed-args warning. */
if (integer_zerop (tok->u.value)
Index: /home/ed/gcc_var_template_2/gcc/testsuite/g++.dg/cpp1z/utf8.C
===================================================================
--- /home/ed/gcc_var_template_2/gcc/testsuite/g++.dg/cpp1z/utf8.C (revision 0)
+++ /home/ed/gcc_var_template_2/gcc/testsuite/g++.dg/cpp1z/utf8.C (working copy)
@@ -0,0 +1,15 @@
+// { dg-do compile }
+// { dg-options "-std=c++1z" }
+
+#include <cassert>
+#include <experimental/type_traits>
+
+auto c = 'c';
+auto u8c = u8'c';
+
+static_assert(std::experimental::is_same_v<decltype(u8c), decltype(c)>, "");
+
+auto u8s = u8"c";
+auto x = u8s[0];
+
+static_assert(std::experimental::is_same_v<decltype(u8c), decltype(x)>, "");
Index: /home/ed/gcc_var_template_2/gcc/testsuite/g++.dg/cpp1z/utf8-neg.C
===================================================================
--- /home/ed/gcc_var_template_2/gcc/testsuite/g++.dg/cpp1z/utf8-neg.C (revision 0)
+++ /home/ed/gcc_var_template_2/gcc/testsuite/g++.dg/cpp1z/utf8-neg.C (working copy)
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c++1z" } */
+
+const static char c0 = u8''; // { dg-error "empty character" }
+const static char c1 = u8'ab'; // { dg-warning "multi-character character constant" }
+const static char c2 = u8'\u0124'; // { dg-warning "multi-character character constant" }
+const static char c3 = u8'\U00064321'; // { dg-warning "multi-character character constant" }
Index: /home/ed/gcc_var_template_2/gcc/testsuite/g++.dg/cpp1z/udlit-utf8char.C
===================================================================
--- /home/ed/gcc_var_template_2/gcc/testsuite/g++.dg/cpp1z/udlit-utf8char.C (revision 0)
+++ /home/ed/gcc_var_template_2/gcc/testsuite/g++.dg/cpp1z/udlit-utf8char.C (working copy)
@@ -0,0 +1,8 @@
+// { dg-do compile }
+// { dg-options "-std=c++1z" }
+
+constexpr int
+operator""_foo(char c)
+{ return c * 100; }
+
+auto cc = u8'8'_foo;