cor3ntin updated this revision to Diff 438990.
cor3ntin added a comment.
Make sure the warning is off by default.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D128059/new/
https://reviews.llvm.org/D128059
Files:
clang/docs/ReleaseNotes.rst
clang/include/clang/Basic/DiagnosticLexKinds.td
clang/lib/Lex/Lexer.cpp
clang/test/Lexer/comment-invalid-utf8.c
llvm/include/llvm/Support/ConvertUTF.h
llvm/lib/Support/ConvertUTF.cpp
Index: llvm/lib/Support/ConvertUTF.cpp
===================================================================
--- llvm/lib/Support/ConvertUTF.cpp
+++ llvm/lib/Support/ConvertUTF.cpp
@@ -417,6 +417,16 @@
return isLegalUTF8(source, length);
}
+/*
+ * Exported function to return the size of the first UTF-8 code unit sequence,
+ * or 0 if it is invalid or cut short by sourceEnd. Needs source < sourceEnd.
+ */
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
+  int length = trailingBytesForUTF8[*source] + 1;
+  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
+                                                                       : 0;
+}
+
/* --------------------------------------------------------------------- */
static unsigned
Index: llvm/include/llvm/Support/ConvertUTF.h
===================================================================
--- llvm/include/llvm/Support/ConvertUTF.h
+++ llvm/include/llvm/Support/ConvertUTF.h
@@ -181,6 +181,8 @@
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
+
unsigned getNumBytesForUTF8(UTF8 firstByte);
/*************************************************************************/
Index: clang/lib/Lex/Lexer.cpp
===================================================================
--- clang/lib/Lex/Lexer.cpp
+++ clang/lib/Lex/Lexer.cpp
@@ -2391,13 +2391,39 @@
//
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
// character that ends the line comment.
+
+ bool WarnOnInvalidUtf8 =
+ !isLexingRawMode() &&
+ !PP->getDiagnostics().isIgnored(diag::warn_invalid_utf8_in_comment,
+ getSourceLocation(CurPtr));
+ bool UnicodeDecodingAlreadyDiagnosed = false;
+
char C;
while (true) {
C = *CurPtr;
// Skip over characters in the fast loop.
- while (C != 0 && // Potentially EOF.
- C != '\n' && C != '\r') // Newline or DOS-style newline.
+ // Warn on invalid UTF-8 if the corresponding warning is enabled, emitting a
+ // diagnostic only once per sequence that cannot be decoded.
+ while ((!WarnOnInvalidUtf8 || isASCII(C)) && C != 0 && // Potentially EOF.
+ C != '\n' && C != '\r') { // Newline or DOS-style newline.
C = *++CurPtr;
+ UnicodeDecodingAlreadyDiagnosed = false;
+ }
+
+ if (WarnOnInvalidUtf8 && !isASCII(C)) {
+ unsigned Length = llvm::getUTF8SequenceSize(
+ (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
+ if (Length == 0) {
+ if (!UnicodeDecodingAlreadyDiagnosed)
+ Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
+ UnicodeDecodingAlreadyDiagnosed = true;
+ ++CurPtr;
+ } else {
+ UnicodeDecodingAlreadyDiagnosed = false;
+ CurPtr += Length;
+ }
+ continue;
+ }
const char *NextLine = CurPtr;
if (C != 0) {
@@ -2664,10 +2690,18 @@
if (C == '/')
C = *CurPtr++;
+ bool WarnOnInvalidUtf8 =
+ !isLexingRawMode() &&
+ !PP->getDiagnostics().isIgnored(diag::warn_invalid_utf8_in_comment,
+ getSourceLocation(CurPtr));
+ bool UnicodeDecodingAlreadyDiagnosed = false;
+
while (true) {
// Skip over all non-interesting characters until we find end of buffer or a
// (probably ending) '/' character.
- if (CurPtr + 24 < BufferEnd &&
+ // When diagnosing invalid UTF-8 sequences we always skip the fast
+ // vectorized path.
+ if (!WarnOnInvalidUtf8 && CurPtr + 24 < BufferEnd &&
// If there is a code-completion point avoid the fast scan because it
// doesn't check for '\0'.
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
@@ -2714,9 +2748,27 @@
C = *CurPtr++;
}
- // Loop to scan the remainder.
- while (C != '/' && C != '\0')
- C = *CurPtr++;
+    // Loop to scan the remainder, warning once per undecodable sequence when
+    // the invalid UTF-8 warning is enabled. Invariant: C == CurPtr[-1], so a
+    // sequence starts at CurPtr - 1 and CurPtr must end one past the new C.
+    while (C != '/' && C != '\0') {
+      if (!WarnOnInvalidUtf8 || isASCII(C)) {
+        UnicodeDecodingAlreadyDiagnosed = false;
+        C = *CurPtr++;
+        continue;
+      }
+      unsigned Length = llvm::getUTF8SequenceSize(
+          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
+      if (Length == 0) {
+        if (!UnicodeDecodingAlreadyDiagnosed)
+          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
+        UnicodeDecodingAlreadyDiagnosed = true;
+      } else {
+        UnicodeDecodingAlreadyDiagnosed = false;
+        CurPtr += Length - 1;
+      }
+      C = *CurPtr++;
+    }
if (C == '/') {
FoundSlash:
Index: clang/include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticLexKinds.td
+++ clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -113,6 +113,8 @@
// Unicode and UCNs
def err_invalid_utf8 : Error<
"source file is not valid UTF-8">;
+def warn_invalid_utf8_in_comment : Extension<
+ "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
def err_character_not_allowed : Error<
"unexpected character <U+%0>">;
def err_character_not_allowed_identifier : Error<
Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -267,6 +267,8 @@
- When using class templates without arguments, clang now tells developers
that template arguments are missing in certain contexts.
This fixes `Issue 55962 <https://github.com/llvm/llvm-project/issues/55962>`_.
+- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
+ comments.
Non-comprehensive list of changes in this release
-------------------------------------------------
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits