cor3ntin created this revision.
cor3ntin requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

This implements a proposed C++ paper (P2290R1, delimited escape sequences) that has not been accepted yet.

\x{XXXX}, \u{XXXX} and \o{OOOO} are accepted in all language modes,
in character and string literals.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D105737

Files:
  clang/include/clang/Basic/DiagnosticLexKinds.td
  clang/lib/Lex/LiteralSupport.cpp
  clang/test/Lexer/char-escapes-delimited.cpp

Index: clang/test/Lexer/char-escapes-delimited.cpp
===================================================================
--- /dev/null
+++ clang/test/Lexer/char-escapes-delimited.cpp
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
+
+const char* errors =
+"\u{}"  //expected-error {{empty escape sequence}}
+"\u{"   //expected-error {{incomplete escape sequence}}
+"\u{h}" //expected-error {{invalid digit in escape sequence}}
+"\x{}"  //expected-error {{empty escape sequence}}
+"\x{"   //expected-error {{incomplete escape sequence}}
+"\x{h}" //expected-error {{invalid digit in escape sequence}}
+"\o{}"  //expected-error {{empty escape sequence}}
+"\o{"   //expected-error {{incomplete escape sequence}}
+"\o{8}" //expected-error {{invalid digit in escape sequence}}
+;
+
+void ucn () {
+    char a = '\u{1234}'; //expected-error {{character too large for enclosing character literal type}}
+    char32_t b  = U'\u{1234}';
+    char32_t b2 = U'\u{1}';
+    char32_t c = U'\u{000000000001234}';
+    char32_t d = U'\u{111111111}'; //expected-error {{hex escape sequence out of range}}
+}
+
+void hex () {
+    char a = '\x{1}';
+    char b = '\x{g}'; // expected-error {{invalid digit in escape sequence}}
+    char c = '\x{ff1}'; // expected-error {{hex escape sequence out of range}}
+    char32_t d = U'\x{FFFFFFFF}';
+    char32_t e = U'\x{FFFFFFFF1}';  // expected-error {{hex escape sequence out of range}}
+}
+
+void octal () {
+    char a = '\o{1}';
+    char b = '\o{8}'; // expected-error {{invalid digit in escape sequence}}
+    char c = '\o{777}'; // //expected-error {{octal escape sequence out of range}}
+    char32_t d = U'\o{37777777777}';
+    char32_t e = U'\o{47777777777}'; // expected-error {{octal escape sequence out of range}}
+}
Index: clang/lib/Lex/LiteralSupport.cpp
===================================================================
--- clang/lib/Lex/LiteralSupport.cpp
+++ clang/lib/Lex/LiteralSupport.cpp
@@ -95,6 +95,7 @@
                                   DiagnosticsEngine *Diags,
                                   const LangOptions &Features) {
   const char *EscapeBegin = ThisTokBuf;
+  bool Delimited = false;
 
   // Skip the '\' char.
   ++ThisTokBuf;
@@ -143,26 +144,45 @@
     break;
   case 'x': { // Hex escape.
     ResultChar = 0;
-    if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
+    if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
+      Delimited = true;
+      ThisTokBuf++;
+      if (*ThisTokBuf == '}') {
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_delimited_escape_empty)
+            << "x";
+        return ResultChar;
+      }
+    } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
       if (Diags)
         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
-             diag::err_hex_escape_no_digits) << "x";
-      HadError = true;
-      break;
+             diag::err_delimited_escape_invalid)
+            << "x";
+      return ResultChar;
     }
 
     // Hex escapes are a maximal series of hex digits.
     bool Overflow = false;
     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
+      if (Delimited && *ThisTokBuf == '}') {
+        ThisTokBuf++;
+        Delimited = false;
+        break;
+      }
       int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
-      if (CharVal == -1) break;
+      if (CharVal == -1) {
+        if (Diags)
+          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+               diag::err_delimited_escape_invalid)
+              << "x";
+        continue;
+      }
       // About to shift out a digit?
       if (ResultChar & 0xF0000000)
         Overflow = true;
       ResultChar <<= 4;
       ResultChar |= CharVal;
     }
-
     // See if any bits will be truncated when evaluated as a character.
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       Overflow = true;
@@ -200,7 +220,57 @@
     }
     break;
   }
+  case 'o': {
+    bool Overflow = false;
+    if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
+      HadError = true;
+      if (Diags)
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_delimited_escape_missing_brace)
+            << 'o';
 
+      break;
+    }
+    ResultChar = 0;
+    Delimited = true;
+    ++ThisTokBuf;
+    if (*ThisTokBuf == '}') {
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::err_delimited_escape_empty)
+          << "x";
+      return ResultChar;
+    }
+
+    while (ThisTokBuf != ThisTokEnd) {
+      if (*ThisTokBuf == '}') {
+        Delimited = false;
+        ThisTokBuf++;
+        break;
+      }
+      if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
+        if (Diags)
+          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+               diag::err_delimited_escape_invalid)
+              << 1;
+        ThisTokBuf++;
+        continue;
+      }
+      if (ResultChar & 0x020000000)
+        Overflow = true;
+
+      ResultChar <<= 3;
+      ResultChar |= *ThisTokBuf++ - '0';
+    }
+    // Check for overflow.  Reject '\777', but not L'\777'.
+    if (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0)) {
+      if (Diags)
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_escape_too_large)
+            << 1;
+      ResultChar &= ~0U >> (32 - CharWidth);
+    }
+    break;
+  }
     // Otherwise, these are not valid escapes.
   case '(': case '{': case '[': case '%':
     // GCC accepts these as extensions.  We warn about them as such though.
@@ -224,6 +294,12 @@
     break;
   }
 
+  if (Delimited) {
+    Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+         diag::err_delimited_escape_incomplete)
+        << 0;
+  }
+
   return ResultChar;
 }
 
@@ -282,25 +358,72 @@
   // Skip the '\u' char's.
   ThisTokBuf += 2;
 
-  if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
+  bool Delimited = false;
+  bool EndDelimiterFound = false;
+
+  if (UcnBegin[1] == 'u' && in_char_string_literal &&
+      ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
+    Delimited = true;
+    ThisTokBuf++;
+  } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
     if (Diags)
       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
            diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
     return false;
   }
   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
-  unsigned short UcnLenSave = UcnLen;
-  for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
+
+  bool Overflow = false;
+  unsigned short Count = 0;
+  for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
+       ++ThisTokBuf) {
+    if (Delimited && *ThisTokBuf == '}') {
+      ++ThisTokBuf;
+      EndDelimiterFound = true;
+      break;
+    }
     int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
-    if (CharVal == -1) break;
+    if (CharVal == -1) {
+      if (Diags) {
+        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+             diag::err_delimited_escape_invalid)
+            << 'u';
+      }
+      Count++;
+      continue;
+    }
+    if (UcnVal & 0xF0000000) {
+      Overflow = true;
+      continue;
+    }
     UcnVal <<= 4;
     UcnVal |= CharVal;
+    Count++;
+  }
+
+  if (Overflow) {
+    if (Diags)
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_escape_too_large)
+          << 0;
+    return false;
   }
+
+  if (Delimited && !EndDelimiterFound) {
+    if (Diags) {
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_delimited_escape_incomplete)
+          << 'u';
+    }
+    return false;
+  }
+
   // If we didn't consume the proper number of digits, there is a problem.
-  if (UcnLenSave) {
+  if (Count == 0 || (!Delimited && Count != UcnLen)) {
     if (Diags)
       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
-           diag::err_ucn_escape_incomplete);
+           Delimited ? diag::err_delimited_escape_empty
+                     : diag::err_ucn_escape_incomplete);
     return false;
   }
 
Index: clang/include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticLexKinds.td
+++ clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -125,6 +125,14 @@
   "identifier contains Unicode character <U+%0> that is invisible in "
   "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
 
+def err_delimited_escape_incomplete : Error<
+  "incomplete escape sequence">;
+def err_delimited_escape_empty : Error<
+  "empty escape sequence">;
+def err_delimited_escape_missing_brace: Error<
+  "expected { after \\%0 escape sequence">;
+def err_delimited_escape_invalid : Error<
+  "invalid digit in escape sequence">;
 def err_hex_escape_no_digits : Error<
   "\\%0 used with no following hex digits">;
 def warn_ucn_escape_no_digits : Warning<
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to