llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-support Author: Abhina Sree (abhina-sree) <details> <summary>Changes</summary> This patch builds upon https://github.com/llvm/llvm-project/pull/138895 and introduces a ParserConversionAction, which controls which charset is used for various string literals. I also introduce a FormatStrConverter, which is used for format string checking. --- Patch is 58.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169803.diff 22 Files Affected: - (modified) clang/include/clang/AST/Expr.h (+6) - (modified) clang/include/clang/AST/FormatString.h (+7-6) - (modified) clang/include/clang/Basic/TargetInfo.h (+3) - (modified) clang/include/clang/Lex/LiteralConverter.h (+1-1) - (modified) clang/include/clang/Parse/Parser.h (+1) - (modified) clang/include/clang/Sema/Sema.h (+6-2) - (modified) clang/lib/AST/Expr.cpp (+15) - (modified) clang/lib/AST/FormatString.cpp (+126-116) - (modified) clang/lib/AST/FormatStringParsing.h (+23-16) - (modified) clang/lib/AST/PrintfFormatString.cpp (+84-65) - (modified) clang/lib/AST/ScanfFormatString.cpp (+19-12) - (modified) clang/lib/Basic/TargetInfo.cpp (+3) - (modified) clang/lib/Lex/LiteralConverter.cpp (+9-1) - (modified) clang/lib/Parse/ParseDecl.cpp (+13) - (modified) clang/lib/Parse/ParseDeclCXX.cpp (+7-3) - (modified) clang/lib/Parse/ParseExpr.cpp (+5-4) - (modified) clang/lib/Parse/Parser.cpp (+4) - (modified) clang/lib/Sema/SemaChecking.cpp (+40-36) - (modified) clang/lib/Sema/SemaExpr.cpp (+10-7) - (modified) clang/test/CodeGen/systemz-charset.c (+8) - (modified) llvm/include/llvm/Support/TextEncoding.h (+10) - (modified) llvm/lib/Support/TextEncoding.cpp (+19) ``````````diff diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 573cc72db35c6..7d1ac3193812f 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -28,6 +28,7 @@ #include "clang/Basic/LangOptions.h" #include 
"clang/Basic/SyncScope.h" #include "clang/Basic/TypeTraits.h" +#include "clang/Lex/LiteralConverter.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SmallVector.h" @@ -2063,6 +2064,11 @@ class PredefinedExpr final return getIdentKindName(getIdentKind()); } + static std::string + ComputeNameAndTranslate(PredefinedIdentKind IK, const Decl *CurrentDecl, + LiteralConverter &LiteralConv, + bool ForceElaboratedPrinting = false); + static std::string ComputeName(PredefinedIdentKind IK, const Decl *CurrentDecl, bool ForceElaboratedPrinting = false); diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h index a284f2c44d633..12083a0d00b4b 100644 --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -19,6 +19,7 @@ #define LLVM_CLANG_AST_FORMATSTRING_H #include "clang/AST/CanonicalType.h" +#include "llvm/Support/TextEncoding.h" #include <optional> namespace clang { @@ -744,9 +745,9 @@ class FormatStringHandler { // Printf-specific handlers. virtual bool HandleInvalidPrintfConversionSpecifier( - const analyze_printf::PrintfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen) { + const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { return true; } @@ -763,9 +764,9 @@ class FormatStringHandler { // Scanf-specific handlers. 
virtual bool HandleInvalidScanfConversionSpecifier( - const analyze_scanf::ScanfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen) { + const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { return true; } diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 1c16f9f79ae68..b3d507e1170dc 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -38,6 +38,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Error.h" +#include "llvm/Support/TextEncoding.h" #include "llvm/Support/VersionTuple.h" #include "llvm/TargetParser/Triple.h" #include <cassert> @@ -320,6 +321,8 @@ class TargetInfo : public TransferrableTargetInfo, virtual ~TargetInfo(); + llvm::TextEncodingConverter *FormatStrConverter; + /// Retrieve the target options. TargetOptions &getTargetOpts() const { assert(TargetOpts && "Missing target options"); diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/LiteralConverter.h index 6a66d2d0ff707..ba6fb6c87a782 100644 --- a/clang/include/clang/Lex/LiteralConverter.h +++ b/clang/include/clang/Lex/LiteralConverter.h @@ -34,7 +34,7 @@ class LiteralConverter { static std::error_code setConvertersFromOptions(LiteralConverter &LiteralConv, const clang::LangOptions &Opts, - const clang::TargetInfo &TInfo); + clang::TargetInfo &TInfo); }; #endif diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 58eb1c0a7c114..97867183b5a1d 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -5633,6 +5633,7 @@ class Parser : public CodeCompletionHandler { bool Finished; }; ObjCImplParsingDataRAII *CurParsedObjCImpl; + ConversionAction ParserConversionAction; /// StashAwayMethodOrFunctionBodyTokens - Consume the tokens and store them 
/// for later parsing. diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index cbfcc9bc0ea99..65567e367dea4 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -54,6 +54,7 @@ #include "clang/Basic/TemplateKinds.h" #include "clang/Basic/TokenKinds.h" #include "clang/Basic/TypeTraits.h" +#include "clang/Lex/LiteralConverter.h" #include "clang/Sema/AnalysisBasedWarnings.h" #include "clang/Sema/Attr.h" #include "clang/Sema/CleanupInfo.h" @@ -7272,9 +7273,12 @@ class Sema final : public SemaBase { /// from multiple tokens. However, the common case is that StringToks points /// to one string. ExprResult ActOnStringLiteral(ArrayRef<Token> StringToks, - Scope *UDLScope = nullptr); + Scope *UDLScope = nullptr, + ConversionAction Action = CA_ToExecEncoding); - ExprResult ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks); + ExprResult + ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks, + ConversionAction Action = CA_ToExecEncoding); /// ControllingExprOrType is either an opaque pointer coming out of a /// ParsedType or an Expr *. 
FIXME: it'd be better to split this interface diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 1d914fa876759..d9765f4a73fcd 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -667,6 +667,21 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedIdentKind IK) { llvm_unreachable("Unknown ident kind for PredefinedExpr"); } +std::string PredefinedExpr::ComputeNameAndTranslate( + PredefinedIdentKind IK, const Decl *CurrentDecl, + LiteralConverter &LiteralConv, bool ForceElaboratedPrinting) { + using namespace clang::charinfo; + std::string Result = ComputeName(IK, CurrentDecl, ForceElaboratedPrinting); + llvm::TextEncodingConverter *Converter = + LiteralConv.getConverter(CA_ToExecEncoding); + if (Converter) { + SmallString<128> Converted; + Converter->convert(Result, Converted); + Result = std::string(Converted); + } + return Result; +} + // FIXME: Maybe this should use DeclPrinter with a special "print predefined // expr" policy instead. std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp index d4cb89b43ae87..be0f527da92e5 100644 --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -33,8 +33,9 @@ FormatStringHandler::~FormatStringHandler() {} // scanf format strings. 
//===----------------------------------------------------------------------===// -OptionalAmount -clang::analyze_format_string::ParseAmount(const char *&Beg, const char *E) { +OptionalAmount clang::analyze_format_string::ParseAmount( + const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter) { const char *I = Beg; UpdateOnReturn <const char*> UpdateBeg(Beg, I); @@ -42,7 +43,7 @@ clang::analyze_format_string::ParseAmount(const char *&Beg, const char *E) { bool hasDigits = false; for ( ; I != E; ++I) { - char c = *I; + char c = FormatStrConverter.convert(*I); if (c >= '0' && c <= '9') { hasDigits = true; accumulator = (accumulator * 10) + (c - '0'); @@ -59,27 +60,23 @@ clang::analyze_format_string::ParseAmount(const char *&Beg, const char *E) { return OptionalAmount(); } -OptionalAmount -clang::analyze_format_string::ParseNonPositionAmount(const char *&Beg, - const char *E, - unsigned &argIndex) { - if (*Beg == '*') { +OptionalAmount clang::analyze_format_string::ParseNonPositionAmount( + const char *&Beg, const char *E, unsigned &argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { + if (FormatStrConverter.convert(*Beg) == '*') { ++Beg; return OptionalAmount(OptionalAmount::Arg, argIndex++, Beg, 0, false); } - return ParseAmount(Beg, E); + return ParseAmount(Beg, E, FormatStrConverter); } -OptionalAmount -clang::analyze_format_string::ParsePositionAmount(FormatStringHandler &H, - const char *Start, - const char *&Beg, - const char *E, - PositionContext p) { - if (*Beg == '*') { +OptionalAmount clang::analyze_format_string::ParsePositionAmount( + FormatStringHandler &H, const char *Start, const char *&Beg, const char *E, + PositionContext p, const llvm::TextEncodingConverter &FormatStrConverter) { + if (FormatStrConverter.convert(*Beg) == '*') { const char *I = Beg + 1; - const OptionalAmount &Amt = ParseAmount(I, E); + const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter); if (Amt.getHowSpecified() == 
OptionalAmount::NotSpecified) { H.HandleInvalidPosition(Beg, I - Beg, p); @@ -94,7 +91,7 @@ clang::analyze_format_string::ParsePositionAmount(FormatStringHandler &H, assert(Amt.getHowSpecified() == OptionalAmount::Constant); - if (*I == '$') { + if (FormatStrConverter.convert(*I) == '$') { // Handle positional arguments // Special case: '*0$', since this is an easy mistake. @@ -114,24 +111,22 @@ clang::analyze_format_string::ParsePositionAmount(FormatStringHandler &H, return OptionalAmount(false); } - return ParseAmount(Beg, E); + return ParseAmount(Beg, E, FormatStrConverter); } - -bool -clang::analyze_format_string::ParseFieldWidth(FormatStringHandler &H, - FormatSpecifier &CS, - const char *Start, - const char *&Beg, const char *E, - unsigned *argIndex) { +bool clang::analyze_format_string::ParseFieldWidth( + FormatStringHandler &H, FormatSpecifier &CS, const char *Start, + const char *&Beg, const char *E, unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { // FIXME: Support negative field widths. 
if (argIndex) { - CS.setFieldWidth(ParseNonPositionAmount(Beg, E, *argIndex)); + CS.setFieldWidth( + ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter)); } else { - const OptionalAmount Amt = - ParsePositionAmount(H, Start, Beg, E, - analyze_format_string::FieldWidthPos); + const OptionalAmount Amt = ParsePositionAmount( + H, Start, Beg, E, analyze_format_string::FieldWidthPos, + FormatStrConverter); if (Amt.isInvalid()) return true; @@ -140,15 +135,13 @@ clang::analyze_format_string::ParseFieldWidth(FormatStringHandler &H, return false; } -bool -clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, - FormatSpecifier &FS, - const char *Start, - const char *&Beg, - const char *E) { +bool clang::analyze_format_string::ParseArgPosition( + FormatStringHandler &H, FormatSpecifier &FS, const char *Start, + const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter) { const char *I = Beg; - const OptionalAmount &Amt = ParseAmount(I, E); + const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter); if (I == E) { // No more characters left? @@ -156,7 +149,8 @@ clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, return true; } - if (Amt.getHowSpecified() == OptionalAmount::Constant && *(I++) == '$') { + if (Amt.getHowSpecified() == OptionalAmount::Constant && + FormatStrConverter.convert(*(I++)) == '$') { // Warn that positional arguments are non-standard. 
H.HandlePosition(Start, I - Start); @@ -177,17 +171,15 @@ clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, return false; } -bool -clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, - FormatSpecifier &FS, - const char *&I, - const char *E, - const LangOptions &LO) { +bool clang::analyze_format_string::ParseVectorModifier( + FormatStringHandler &H, FormatSpecifier &FS, const char *&I, const char *E, + const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter) { if (!LO.OpenCL) return false; const char *Start = I; - if (*I == 'v') { + if (FormatStrConverter.convert(*I) == 'v') { ++I; if (I == E) { @@ -195,7 +187,7 @@ clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, return true; } - OptionalAmount NumElts = ParseAmount(I, E); + OptionalAmount NumElts = ParseAmount(I, E, FormatStrConverter); if (NumElts.getHowSpecified() != OptionalAmount::Constant) { H.HandleIncompleteSpecifier(Start, E - Start); return true; @@ -207,86 +199,104 @@ clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, return false; } -bool -clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, - const char *&I, - const char *E, - const LangOptions &LO, - bool IsScanf) { +bool clang::analyze_format_string::ParseLengthModifier( + FormatSpecifier &FS, const char *&I, const char *E, const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter, bool IsScanf) { LengthModifier::Kind lmKind = LengthModifier::None; const char *lmPosition = I; - switch (*I) { - default: - return false; - case 'h': + switch (FormatStrConverter.convert(*I)) { + default: + return false; + case 'h': + ++I; + if (I != E && FormatStrConverter.convert(*I) == 'h') { ++I; - if (I != E && *I == 'h') { - ++I; - lmKind = LengthModifier::AsChar; - } else if (I != E && *I == 'l' && LO.OpenCL) { - ++I; - lmKind = LengthModifier::AsShortLong; - } else { - lmKind = LengthModifier::AsShort; - } - break; 
- case 'l': + lmKind = LengthModifier::AsChar; + } else if (I != E && FormatStrConverter.convert(*I) == 'l' && LO.OpenCL) { + ++I; + lmKind = LengthModifier::AsShortLong; + } else { + lmKind = LengthModifier::AsShort; + } + break; + case 'l': + ++I; + if (I != E && FormatStrConverter.convert(*I) == 'l') { + ++I; + lmKind = LengthModifier::AsLongLong; + } else { + lmKind = LengthModifier::AsLong; + } + break; + case 'j': + lmKind = LengthModifier::AsIntMax; + ++I; + break; + case 'z': + lmKind = LengthModifier::AsSizeT; + ++I; + break; + case 't': + lmKind = LengthModifier::AsPtrDiff; + ++I; + break; + case 'L': + lmKind = LengthModifier::AsLongDouble; + ++I; + break; + case 'q': + lmKind = LengthModifier::AsQuad; + ++I; + break; + case 'a': + if (IsScanf && !LO.C99 && !LO.CPlusPlus11) { + // For scanf in C90, look at the next character to see if this should + // be parsed as the GNU extension 'a' length modifier. If not, this + // will be parsed as a conversion specifier. ++I; - if (I != E && *I == 'l') { - ++I; - lmKind = LengthModifier::AsLongLong; - } else { - lmKind = LengthModifier::AsLong; + if (I != E && (FormatStrConverter.convert(*I) == 's' || + FormatStrConverter.convert(*I) == 'S' || + FormatStrConverter.convert(*I) == '[')) { + lmKind = LengthModifier::AsAllocate; + break; } + --I; + } + return false; + case 'm': + if (IsScanf) { + lmKind = LengthModifier::AsMAllocate; + ++I; break; - case 'j': lmKind = LengthModifier::AsIntMax; ++I; break; - case 'z': lmKind = LengthModifier::AsSizeT; ++I; break; - case 't': lmKind = LengthModifier::AsPtrDiff; ++I; break; - case 'L': lmKind = LengthModifier::AsLongDouble; ++I; break; - case 'q': lmKind = LengthModifier::AsQuad; ++I; break; - case 'a': - if (IsScanf && !LO.C99 && !LO.CPlusPlus11) { - // For scanf in C90, look at the next character to see if this should - // be parsed as the GNU extension 'a' length modifier. If not, this - // will be parsed as a conversion specifier. 
- ++I; - if (I != E && (*I == 's' || *I == 'S' || *I == '[')) { - lmKind = LengthModifier::AsAllocate; - break; - } - --I; - } - return false; - case 'm': - if (IsScanf) { - lmKind = LengthModifier::AsMAllocate; - ++I; + } + return false; + // printf: AsInt64, AsInt32, AsInt3264 + // scanf: AsInt64 + case 'I': + if (I + 1 != E && I + 2 != E) { + if (FormatStrConverter.convert(I[1]) == '6' && + FormatStrConverter.convert(I[2]) == '4') { + I += 3; + lmKind = LengthModifier::AsInt64; break; } - return false; - // printf: AsInt64, AsInt32, AsInt3264 - // scanf: AsInt64 - case 'I': - if (I + 1 != E && I + 2 != E) { - if (I[1] == '6' && I[2] == '4') { - I += 3; - lmKind = LengthModifier::AsInt64; - break; - } - if (IsScanf) - return false; + if (IsScanf) + return false; - if (I[1] == '3' && I[2] == '2') { - I += 3; - lmKind = LengthModifier::AsInt32; - break; - } + if (FormatStrConverter.convert(I[1]) == '3' && + FormatStrConverter.convert(I[2]) == '2') { + I += 3; + lmKind = LengthModifier::AsInt32; + break; } - ++I; - lmKind = LengthModifier::AsInt3264; - break; - case 'w': - lmKind = LengthModifier::AsWide; ++I; break; + } + ++I; + lmKind = LengthModifier::AsInt3264; + break; + case 'w': + lmKind = LengthModifier::AsWide; + ++I; + break; } LengthModifier lm(lmPosition, lmKind); FS.setLengthModifier(lm); diff --git a/clang/lib/AST/FormatStringParsing.h b/clang/lib/AST/FormatStringParsing.h index 764e5d46394d7..7ad6d4b98d2ac 100644 --- a/clang/lib/AST/FormatStringParsing.h +++ b/clang/lib/AST/FormatStringParsing.h @@ -37,31 +37,38 @@ class UpdateOnReturn { namespace analyze_format_string { -OptionalAmount ParseAmount(const char *&Beg, const char *E); -OptionalAmount ParseNonPositionAmount(const char *&Beg, const char *E, - unsigned &argIndex); +OptionalAmount +ParseAmount(const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter); -OptionalAmount ParsePositionAmount(FormatStringHandler &H, - const char *Start, const char *&Beg, - const 
char *E, PositionContext p); +OptionalAmount +ParseNonPositionAmount(const char *&Beg, const char *E, unsigned &argIndex, + const llvm::TextEncodingConverter &FormatStrConverter); -bool ParseFieldWidth(FormatStringHandler &H, - FormatSpecifier &CS, +OptionalAmount +ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg, + const char *E, PositionContext p, + const llvm::TextEncodingConverter &FormatStrConverter); + +bool ParseFieldWidth(FormatStringHandler &H, FormatSpecifier &CS, const char *Start, const char *&Beg, const char *E, - unsigned *argIndex); + ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/169803 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
