This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG4174f0ca618b: [Clang] Add a warning on invalid UTF-8 in
comments. (authored by cor3ntin).
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D128059/new/
https://reviews.llvm.org/D128059
Files:
clang/docs/ReleaseNotes.rst
clang/include/clang/Basic/DiagnosticLexKinds.td
clang/lib/Lex/Lexer.cpp
clang/test/Lexer/comment-invalid-utf8.c
clang/test/SemaCXX/static-assert.cpp
llvm/include/llvm/Support/ConvertUTF.h
llvm/lib/Support/ConvertUTF.cpp
Index: llvm/lib/Support/ConvertUTF.cpp
===================================================================
--- llvm/lib/Support/ConvertUTF.cpp
+++ llvm/lib/Support/ConvertUTF.cpp
@@ -417,6 +417,16 @@
return isLegalUTF8(source, length);
}
+/*
+ * Exported function to return the size of the first utf-8 code unit sequence,
+ * Or 0 if the sequence is not valid;
+ */
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
+ int length = trailingBytesForUTF8[*source] + 1;
+ return (length > sourceEnd - source && isLegalUTF8(source, length)) ? length
+ : 0;
+}
+
/* --------------------------------------------------------------------- */
static unsigned
Index: llvm/include/llvm/Support/ConvertUTF.h
===================================================================
--- llvm/include/llvm/Support/ConvertUTF.h
+++ llvm/include/llvm/Support/ConvertUTF.h
@@ -181,6 +181,8 @@
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
+
unsigned getNumBytesForUTF8(UTF8 firstByte);
/*************************************************************************/
Index: clang/test/SemaCXX/static-assert.cpp
===================================================================
--- clang/test/SemaCXX/static-assert.cpp
+++ clang/test/SemaCXX/static-assert.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu -Wno-invalid-utf8
int f(); // expected-note {{declared here}}
Index: clang/test/Lexer/comment-invalid-utf8.c
===================================================================
--- /dev/null
+++ clang/test/Lexer/comment-invalid-utf8.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
+// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
+// nowarn-no-diagnostics
+
+// This file is purposefully encoded as windows-1252
+// be careful when modifying.
+
+//Â
+// expected-warning@-1 {{invalid UTF-8 in comment}}
+
+// Â ÂÂÂÂ
ÂÂÂÂ Â Â Â Â
+// expected-warning@-1 6{{invalid UTF-8 in comment}}
+
+/*Â*/
+// expected-warning@-1 {{invalid UTF-8 in comment}}
+
+/*Â ÂÂÂÂ
ÂÂÂÂ Â Â Â Â*/
+// expected-warning@-1 6{{invalid UTF-8 in comment}}
+
+/*
+Â
+*/
+// expected-warning@-2 {{invalid UTF-8 in comment}}
+
+// abcd
+// Âabcd
+// expected-warning@-1 {{invalid UTF-8 in comment}}
Index: clang/lib/Lex/Lexer.cpp
===================================================================
--- clang/lib/Lex/Lexer.cpp
+++ clang/lib/Lex/Lexer.cpp
@@ -2392,13 +2392,37 @@
//
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
// character that ends the line comment.
+
+ // C++23 [lex.phases] p1
+ // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
+ // diagnostic only once per entire ill-formed subsequence to avoid
+ // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
+ bool UnicodeDecodingAlreadyDiagnosed = false;
+
char C;
while (true) {
C = *CurPtr;
// Skip over characters in the fast loop.
- while (C != 0 && // Potentially EOF.
- C != '\n' && C != '\r') // Newline or DOS-style newline.
+ while (isASCII(C) && C != 0 && // Potentially EOF.
+ C != '\n' && C != '\r') { // Newline or DOS-style newline.
C = *++CurPtr;
+ UnicodeDecodingAlreadyDiagnosed = false;
+ }
+
+ if (!isASCII(C)) {
+ unsigned Length = llvm::getUTF8SequenceSize(
+ (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
+ if (Length == 0) {
+ if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
+ Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
+ UnicodeDecodingAlreadyDiagnosed = true;
+ ++CurPtr;
+ } else {
+ UnicodeDecodingAlreadyDiagnosed = false;
+ CurPtr += Length;
+ }
+ continue;
+ }
const char *NextLine = CurPtr;
if (C != 0) {
@@ -2665,6 +2689,12 @@
if (C == '/')
C = *CurPtr++;
+ // C++23 [lex.phases] p1
+ // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
+ // diagnostic only once per entire ill-formed subsequence to avoid
+ // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
+ bool UnicodeDecodingAlreadyDiagnosed = false;
+
while (true) {
// Skip over all non-interesting characters until we find end of buffer or a
// (probably ending) '/' character.
@@ -2673,14 +2703,24 @@
// doesn't check for '\0'.
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
// While not aligned to a 16-byte boundary.
- while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
+ while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
+ if (!isASCII(C)) {
+ CurPtr--;
+ goto MultiByteUTF8;
+ }
C = *CurPtr++;
-
+ }
if (C == '/') goto FoundSlash;
#ifdef __SSE2__
__m128i Slashes = _mm_set1_epi8('/');
- while (CurPtr+16 <= BufferEnd) {
+ while (CurPtr + 16 < BufferEnd) {
+ int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
+ if (LLVM_UNLIKELY(Mask != 0)) {
+ CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
+ goto MultiByteUTF8;
+ }
+ // look for slashes
int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
Slashes));
if (cmp != 0) {
@@ -2693,21 +2733,41 @@
CurPtr += 16;
}
#elif __ALTIVEC__
+ __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80};
__vector unsigned char Slashes = {
'/', '/', '/', '/', '/', '/', '/', '/',
'/', '/', '/', '/', '/', '/', '/', '/'
};
- while (CurPtr + 16 <= BufferEnd &&
- !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
+ while (CurPtr + 16 < BufferEnd) {
+ if (LLVM_UNLIKELY(
+ vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
+ goto MultiByteUTF8;
+ if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
+ C = *CurPtr++;
+ break;
+ }
CurPtr += 16;
+ }
+
#else
- // Scan for '/' quickly. Many block comments are very large.
- while (CurPtr[0] != '/' &&
- CurPtr[1] != '/' &&
- CurPtr[2] != '/' &&
- CurPtr[3] != '/' &&
- CurPtr+4 < BufferEnd) {
- CurPtr += 4;
+ while (CurPtr + 16 <= BufferEnd) {
+ bool HasNonASCII = false;
+ for (unsigned I = 0; I < 16; ++I) {
+ HasNonASCII |= !isASCII(CurPtr[I]);
+ }
+
+ if (LLVM_UNLIKELY(HasNonASCII))
+ goto MultiByteUTF8;
+
+ bool HasSlash = false;
+ for (unsigned I = 0; I < 16; ++I) {
+ HasSlash |= CurPtr[I] == '/';
+ }
+ if (HasSlash)
+ break;
+ CurPtr += 16;
}
#endif
@@ -2715,9 +2775,28 @@
C = *CurPtr++;
}
- // Loop to scan the remainder.
- while (C != '/' && C != '\0')
- C = *CurPtr++;
+ // Loop to scan the remainder, warning on invalid UTF-8
+ // if the corresponding warning is enabled, emitting a diagnostic only once
+ // per sequence that cannot be decoded.
+ while (C != '/' && C != '\0') {
+ if (isASCII(C)) {
+ UnicodeDecodingAlreadyDiagnosed = false;
+ C = *CurPtr++;
+ continue;
+ }
+ MultiByteUTF8:
+ unsigned Length = llvm::getUTF8SequenceSize(
+ (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
+ if (Length == 0) {
+ if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
+ Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
+ UnicodeDecodingAlreadyDiagnosed = true;
+ C = *CurPtr++;
+ continue;
+ }
+ UnicodeDecodingAlreadyDiagnosed = false;
+ C = *(CurPtr += Length - 1);
+ }
if (C == '/') {
FoundSlash:
Index: clang/include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticLexKinds.td
+++ clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -113,6 +113,8 @@
// Unicode and UCNs
def err_invalid_utf8 : Error<
"source file is not valid UTF-8">;
+def warn_invalid_utf8_in_comment : Extension<
+ "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
def err_character_not_allowed : Error<
"unexpected character <U+%0>">;
def err_character_not_allowed_identifier : Error<
Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -279,6 +279,8 @@
unevaluated operands of a ``typeid`` expression, as they are now
modeled correctly in the CFG. This fixes
`Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
+- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
+ comments.
Non-comprehensive list of changes in this release
-------------------------------------------------
@@ -576,7 +578,7 @@
- Added ``forEachTemplateArgument`` matcher which creates a match every
time a ``templateArgument`` matches the matcher supplied to it.
-
+
- Added ``objcStringLiteral`` matcher which matches ObjectiveC String
literal expressions.
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits