abhina.sreeskantharajan updated this revision to Diff 327470.
abhina.sreeskantharajan added a comment.
Thanks for the feedback! I haven't addressed all of the comments yet, but I've
made major renaming changes and would appreciate feedback on them.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D93031/new/
https://reviews.llvm.org/D93031
Files:
clang/include/clang/Basic/LangOptions.h
clang/include/clang/Basic/TokenKinds.h
clang/include/clang/Driver/Options.td
clang/include/clang/Lex/LiteralConverter.h
clang/include/clang/Lex/LiteralSupport.h
clang/include/clang/Lex/Preprocessor.h
clang/lib/Driver/ToolChains/Clang.cpp
clang/lib/Frontend/CompilerInstance.cpp
clang/lib/Lex/CMakeLists.txt
clang/lib/Lex/LiteralConverter.cpp
clang/lib/Lex/LiteralSupport.cpp
clang/test/CodeGen/systemz-charset.c
clang/test/CodeGen/systemz-charset.cpp
clang/test/Driver/cl-options.c
clang/test/Driver/clang_f_opts.c
llvm/include/llvm/ADT/Triple.h
llvm/lib/Support/Triple.cpp
Index: llvm/lib/Support/Triple.cpp
===================================================================
--- llvm/lib/Support/Triple.cpp
+++ llvm/lib/Support/Triple.cpp
@@ -1038,6 +1038,13 @@
return Tmp.split('-').second; // Strip second component
}
+// Name of the system charset for this triple: IBM-1047 when the OS component
+// is z/OS, UTF-8 for every other OS.
+StringRef Triple::getSystemCharset() const {
+  const bool TargetsZOS = getOS() == llvm::Triple::ZOS;
+  return TargetsZOS ? "IBM-1047" : "UTF-8";
+}
+
static unsigned EatNumber(StringRef &Str) {
assert(!Str.empty() && isDigit(Str[0]) && "Not a number");
unsigned Result = 0;
Index: llvm/include/llvm/ADT/Triple.h
===================================================================
--- llvm/include/llvm/ADT/Triple.h
+++ llvm/include/llvm/ADT/Triple.h
@@ -395,6 +395,9 @@
/// if the environment component is present).
StringRef getOSAndEnvironmentName() const;
+ /// getSystemCharset - Get the name of the system charset of the triple:
+ /// "IBM-1047" when the OS is z/OS and "UTF-8" otherwise.
+ StringRef getSystemCharset() const;
+
/// @}
/// @name Convenience Predicates
/// @{
Index: clang/test/Driver/clang_f_opts.c
===================================================================
--- clang/test/Driver/clang_f_opts.c
+++ clang/test/Driver/clang_f_opts.c
@@ -209,8 +209,14 @@
// RUN: %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-CHARSET %s
// CHECK-INVALID-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1'
-// RUN: %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s
-// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1'
+// RUN: %clang -### -S -fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s
+// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'invalid-charset' in '-fexec-charset'
+
+// Test that we support the following exec charsets.
+// RUN: %clang -### -S -fexec-charset=UTF-8 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
+// RUN: %clang -### -S -fexec-charset=ISO8859-1 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
+// RUN: %clang -### -S -fexec-charset=IBM-1047 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
+// INVALID-NOT: error: invalid value
// Test that we don't error on these.
// RUN: %clang -### -S -Werror \
@@ -224,7 +230,7 @@
// RUN: -fident -fno-ident \
// RUN: -fimplicit-templates -fno-implicit-templates \
// RUN: -finput-charset=UTF-8 \
-// RUN: -fexec-charset=UTF-8 \
+// RUN: -fexec-charset=UTF-8 \
// RUN: -fivopts -fno-ivopts \
// RUN: -fnon-call-exceptions -fno-non-call-exceptions \
// RUN: -fpermissive -fno-permissive \
Index: clang/test/Driver/cl-options.c
===================================================================
--- clang/test/Driver/cl-options.c
+++ clang/test/Driver/cl-options.c
@@ -209,10 +209,11 @@
// RUN: %clang_cl /source-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=source-charset-utf-16 %s
// source-charset-utf-16: invalid value 'utf-16' in '/source-charset:utf-16'
-// /execution-charset: should warn on everything except UTF-8.
-// RUN: %clang_cl /execution-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-utf-16 %s
-// execution-charset-utf-16: invalid value 'utf-16' in '/execution-charset:utf-16'
+// /execution-charset: should warn on invalid charsets.
+// RUN: %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s
+// execution-charset-invalid: invalid value 'invalid-charset' in '-fexec-charset'
//
+
// RUN: %clang_cl /Umymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s
// RUN: %clang_cl /U mymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s
// U: "-U" "mymacro"
Index: clang/test/CodeGen/systemz-charset.cpp
===================================================================
--- /dev/null
+++ clang/test/CodeGen/systemz-charset.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang %s -std=c++17 -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
+
+const char *RawString = R"(Hello\n)";
+//CHECK: c"\C8\85\93\93\96\E0\95\00"
+
+char UnicodeChar8 = u8'1';
+//CHECK: i8 49
+char16_t UnicodeChar16 = u'1';
+//CHECK: i16 49
+char32_t UnicodeChar32 = U'1';
+//CHECK: i32 49
+
+const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
+
+const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0]
+
+const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0]
+
+const char *UnicodeString8 = u8"Hello";
+//CHECK: c"Hello\00"
+const char16_t *UnicodeString16 = u"Hello";
+//CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0]
+const char32_t *UnicodeString32 = U"Hello";
+//CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0]
+
+const char *UnicodeRawString8 = u8R"("Hello\")";
+//CHECK: c"\22Hello\\\22\00"
+const char16_t *UnicodeRawString16 = uR"("Hello\")";
+//CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0]
+const char32_t *UnicodeRawString32 = UR"("Hello\")";
+//CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0]
+
+const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF";
+//CHECK: c"\C3\A2\C2\AC\C3\9F\00"
+const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0]
+const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0]
Index: clang/test/CodeGen/systemz-charset.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/systemz-charset.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
+// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
+
+const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+// CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
+
+const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz";
+//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00"
+
+const char *Digits = "0123456789";
+// CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00"
+
+const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@=";
+// CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00"
+
+const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00"
+
+const char *InvalidEscape = "\y\z";
+//CHECK: c"oo\00"
+
+const char *HexCharacters = "\x12\x13\x14";
+//CHECK: c"\12\13\14\00"
+
+const char *OctalCharacters = "\141\142\143";
+//CHECK: c"abc\00"
+
+const char singleChar = 'a';
+//CHECK: i8 -127
+
+const char *UcnCharacters = "\u00E2\u00AC\U000000DF";
+//CHECK: c"B\B0Y\00"
+
+const char *Unicode = "ÿ";
+//CHECK: c"\DF\00"
Index: clang/lib/Lex/LiteralSupport.cpp
===================================================================
--- clang/lib/Lex/LiteralSupport.cpp
+++ clang/lib/Lex/LiteralSupport.cpp
@@ -93,7 +93,8 @@
const char *ThisTokEnd, bool &HadError,
FullSourceLoc Loc, unsigned CharWidth,
DiagnosticsEngine *Diags,
- const LangOptions &Features) {
+ const LangOptions &Features,
+ llvm::CharSetConverter *Converter) {
const char *EscapeBegin = ThisTokBuf;
// Skip the '\' char.
@@ -102,6 +103,8 @@
// We know that this character can't be off the end of the buffer, because
// that would have been \", which would not have been the end of string.
unsigned ResultChar = *ThisTokBuf++;
+ bool Translate = true;
+ bool Invalid = false;
switch (ResultChar) {
// These map to themselves.
case '\\': case '\'': case '"': case '?': break;
@@ -142,6 +145,7 @@
ResultChar = 11;
break;
case 'x': { // Hex escape.
+ Translate = false;
ResultChar = 0;
if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
if (Diags)
@@ -179,6 +183,7 @@
case '4': case '5': case '6': case '7': {
// Octal escapes.
--ThisTokBuf;
+ Translate = false;
ResultChar = 0;
// Octal escapes are a series of octal digits with maximum length 3.
@@ -210,6 +215,7 @@
<< std::string(1, ResultChar);
break;
default:
+ Invalid = true;
if (!Diags)
break;
@@ -224,6 +230,15 @@
break;
}
+ if (Translate && Converter) {
+ // Invalid escapes are written as '?' and then translated.
+ char ByteChar = Invalid ? '?' : ResultChar;
+ SmallString<8> ResultCharConv;
+ Converter->convert(std::string(1, ByteChar), ResultCharConv);
+ assert(ResultCharConv.size() == 1 &&
+ "Char size increased after translation");
+ ResultChar = ResultCharConv[0];
+ }
return ResultChar;
}
@@ -1241,6 +1256,7 @@
HadError = false;
Kind = kind;
+ LiteralConverter *LT = &PP.getLiteralConverter();
const char *TokBegin = begin;
@@ -1302,6 +1318,10 @@
largest_character_for_kind = 0x7Fu;
}
+ llvm::CharSetConverter *Converter = nullptr;
+ if (!isUTFLiteral(Kind) && LT)
+ Converter = LT->getConverter(ToExecCharset);
+
while (begin != end) {
// Is this a span of non-escape characters?
if (begin[0] != '\\') {
@@ -1339,6 +1359,13 @@
HadError = true;
PP.Diag(Loc, diag::err_character_too_large);
}
+ if (!HadError && Converter) {
+ assert(Kind != tok::wide_char_constant &&
+ "Wide character translation not supported");
+ SmallString<1> ConvertedChar;
+ Converter->convert(StringRef((char *)tmp_out_start), ConvertedChar);
+ memmove((void *)tmp_out_start, ConvertedChar.data(), 1);
+ }
}
}
@@ -1361,9 +1388,9 @@
}
unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
uint64_t result =
- ProcessCharEscape(TokBegin, begin, end, HadError,
- FullSourceLoc(Loc,PP.getSourceManager()),
- CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
+ ProcessCharEscape(TokBegin, begin, end, HadError,
+ FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
+ &PP.getDiagnostics(), PP.getLangOpts(), nullptr);
*buffer_begin++ = result;
}
@@ -1471,17 +1498,21 @@
/// hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
-StringLiteralParser::
-StringLiteralParser(ArrayRef<Token> StringToks,
- Preprocessor &PP, bool Complain)
- : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
- Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
- MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
- ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
- init(StringToks);
+
+StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
+ Preprocessor &PP, bool Complain,
+ ConversionAction Action)
+ : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
+ Target(PP.getTargetInfo()),
+ Diags(Complain ? &PP.getDiagnostics() : nullptr),
+ LT(&PP.getLiteralConverter()), MaxTokenLength(0), SizeBound(0),
+ CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()),
+ hadError(false), Pascal(false) {
+ init(StringToks, Action);
}
-void StringLiteralParser::init(ArrayRef<Token> StringToks){
+void StringLiteralParser::init(ArrayRef<Token> StringToks,
+ ConversionAction Action) {
// The literal token may have come from an invalid source location (e.g. due
// to a PCH error), in which case the token length will be 0.
if (StringToks.empty() || StringToks[0].getLength() < 2)
@@ -1557,6 +1588,10 @@
SourceLocation UDSuffixTokLoc;
+ llvm::CharSetConverter *Converter = nullptr;
+ if (!isUTFLiteral(Kind) && LT)
+ Converter = LT->getConverter(Action);
+
for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
const char *ThisTokBuf = &TokenBuf[0];
// Get the spelling of the token, which eliminates trigraphs, etc. We know
@@ -1652,6 +1687,16 @@
if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
hadError = true;
+ if (!hadError && Converter) {
+ assert(Kind != tok::wide_string_literal &&
+ "Wide character translation not supported");
+ SmallString<256> CpConv;
+ int ResultLength = BeforeCRLF.size() * CharByteWidth;
+ char *Cp = ResultPtr - ResultLength;
+ Converter->convert(StringRef(Cp, ResultLength), CpConv);
+ memmove(Cp, CpConv.data(), ResultLength);
+ ResultPtr = Cp + CpConv.size();
+ }
// Point into the \n inside the \r\n sequence and operate on the
// remaining portion of the literal.
RemainingTokenSpan = AfterCRLF.substr(1);
@@ -1685,25 +1730,45 @@
++ThisTokBuf;
} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
+ int Length = ThisTokBuf - InStart;
// Copy the character span over.
if (CopyStringFragment(StringToks[i], ThisTokBegin,
StringRef(InStart, ThisTokBuf - InStart)))
hadError = true;
+
+ if (!hadError && Converter) {
+ assert(Kind != tok::wide_string_literal &&
+ "Wide character translation not supported");
+ SmallString<256> CpConv;
+ int ResultLength = Length * CharByteWidth;
+ char *Cp = ResultPtr - ResultLength;
+ Converter->convert(StringRef(Cp, ResultLength), CpConv);
+ memmove(Cp, CpConv.data(), ResultLength);
+ ResultPtr = Cp + CpConv.size();
+ }
continue;
}
// Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
- EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
- ResultPtr, hadError,
+ char *Cp = ResultPtr;
+ EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr,
+ hadError,
FullSourceLoc(StringToks[i].getLocation(), SM),
CharByteWidth, Diags, Features);
+
+ if (!hadError && Converter) {
+ SmallString<8> CpConv;
+ Converter->convert(StringRef(Cp), CpConv);
+ memmove(Cp, CpConv.data(), CpConv.size());
+ ResultPtr = Cp + CpConv.size();
+ }
continue;
}
// Otherwise, this is a non-UCN escape character. Process it.
unsigned ResultChar =
- ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
- FullSourceLoc(StringToks[i].getLocation(), SM),
- CharByteWidth*8, Diags, Features);
+ ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
+ FullSourceLoc(StringToks[i].getLocation(), SM),
+ CharByteWidth * 8, Diags, Features, Converter);
if (CharByteWidth == 4) {
// FIXME: Make the type of the result buffer correct instead of
@@ -1897,8 +1962,8 @@
ByteNo -= Len;
} else {
ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
- FullSourceLoc(Tok.getLocation(), SM),
- CharByteWidth*8, Diags, Features);
+ FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
+ Diags, Features, nullptr);
--ByteNo;
}
assert(!HadError && "This method isn't valid on erroneous strings");
Index: clang/lib/Lex/LiteralConverter.cpp
===================================================================
--- /dev/null
+++ clang/lib/Lex/LiteralConverter.cpp
@@ -0,0 +1,68 @@
+//===--- LiteralConverter.cpp - Translator for String Literals -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/LiteralConverter.h"
+#include "clang/Basic/DiagnosticDriver.h"
+
+using namespace llvm;
+
+// Look up the cached converter stored under the charset name Codepage.
+// Returns nullptr when no converter has been stored for that name.
+llvm::CharSetConverter *LiteralConverter::getConverter(const char *Codepage) {
+  auto Found = CharsetConverters.find(Codepage);
+  return Found == CharsetConverters.end() ? nullptr : &Found->second;
+}
+
+// Map a conversion action to the charset it targets and return the cached
+// converter for that charset, or nullptr if none has been stored.
+llvm::CharSetConverter *
+LiteralConverter::getConverter(ConversionAction Action) {
+  StringRef CodePage;
+  switch (Action) {
+  case ToSystemCharset:
+    CodePage = SystemCharset;
+    break;
+  case ToExecCharset:
+    CodePage = ExecCharset;
+    break;
+  default:
+    // Any other action (e.g. NoConversion) looks up the internal charset.
+    CodePage = InternalCharset;
+    break;
+  }
+  return getConverter(CodePage.data());
+}
+
+// Return the converter from the internal charset to the charset named To,
+// creating and caching it on first use.
+//
+// Returns the cached or newly created converter, or nullptr when
+// CharSetConverter::create() fails for the (InternalCharset, To) pair.
+llvm::CharSetConverter *
+LiteralConverter::createAndInsertCharConverter(const char *To) {
+  // Reuse a converter that was already created for this charset.
+  if (llvm::CharSetConverter *Cached = getConverter(To))
+    return Cached;
+
+  ErrorOr<CharSetConverter> ErrorOrConverter =
+      llvm::CharSetConverter::create(InternalCharset.data(), To);
+  if (!ErrorOrConverter)
+    return nullptr;
+
+  // Hand back the entry that was just stored. Returning the result of the
+  // earlier (failed) lookup would always yield nullptr here, which callers
+  // interpret as "the charset is invalid".
+  auto Inserted = CharsetConverters.insert_or_assign(
+      StringRef(To), std::move(*ErrorOrConverter));
+  return &Inserted.first->second;
+}
+
+// Record the internal, system and execution charsets for this compilation and
+// create the converters out of the internal charset that they require.
+void LiteralConverter::setConvertersFromOptions(
+    const clang::LangOptions &Opts, const clang::TargetInfo &TInfo,
+    clang::DiagnosticsEngine &Diags) {
+  InternalCharset = "UTF-8";
+  SystemCharset = TInfo.getTriple().getSystemCharset();
+  // An empty exec charset option means "same as the internal charset".
+  if (Opts.ExecCharset.empty())
+    ExecCharset = InternalCharset;
+  else
+    ExecCharset = Opts.ExecCharset;
+
+  // The system charset needs a converter only when it differs from the
+  // internal one. A failure to create it is not reported.
+  if (InternalCharset != SystemCharset)
+    createAndInsertCharConverter(SystemCharset.data());
+
+  // Likewise for the charset named by the fexec-charset option, except that
+  // a failure to obtain its converter is diagnosed as an invalid value.
+  const bool NeedsExecConverter = InternalCharset != ExecCharset;
+  if (NeedsExecConverter && !createAndInsertCharConverter(ExecCharset.data()))
+    Diags.Report(clang::diag::err_drv_invalid_value)
+        << "-fexec-charset" << ExecCharset;
+}
Index: clang/lib/Lex/CMakeLists.txt
===================================================================
--- clang/lib/Lex/CMakeLists.txt
+++ clang/lib/Lex/CMakeLists.txt
@@ -7,6 +7,7 @@
HeaderMap.cpp
HeaderSearch.cpp
Lexer.cpp
+ LiteralConverter.cpp
LiteralSupport.cpp
MacroArgs.cpp
MacroInfo.cpp
Index: clang/lib/Frontend/CompilerInstance.cpp
===================================================================
--- clang/lib/Frontend/CompilerInstance.cpp
+++ clang/lib/Frontend/CompilerInstance.cpp
@@ -12,6 +12,7 @@
#include "clang/AST/Decl.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticDriver.h"
#include "clang/Basic/FileManager.h"
#include "clang/Basic/LangStandard.h"
#include "clang/Basic/SourceManager.h"
@@ -29,6 +30,7 @@
#include "clang/Frontend/Utils.h"
#include "clang/Frontend/VerifyDiagnosticConsumer.h"
#include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/LiteralConverter.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
#include "clang/Sema/CodeCompleteConsumer.h"
@@ -527,6 +529,8 @@
/*ShowAllHeaders=*/true, /*OutputPath=*/"",
/*ShowDepth=*/true, /*MSStyle=*/true);
}
+ PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(),
+ getDiagnostics());
}
std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) {
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -35,6 +35,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Option/ArgList.h"
+#include "llvm/Support/CharSet.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Compression.h"
@@ -6168,12 +6169,21 @@
<< value;
}
- // -fexec_charset=UTF-8 is default. Reject others
- if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
- StringRef value = execCharset->getValue();
- if (!value.equals_lower("utf-8"))
- D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
- << value;
+ // Pass all -fexec-charset options to cc1.
+ std::vector<std::string> vList =
+ Args.getAllArgValues(options::OPT_fexec_charset_EQ);
+ // Set the default fexec-charset as the system charset.
+ CmdArgs.push_back("-fexec-charset");
+ CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset()));
+ for (auto it = vList.begin(), ie = vList.end(); it != ie; ++it) {
+ llvm::ErrorOr<llvm::CharSetConverter> ErrorOrConverter =
+ llvm::CharSetConverter::create("UTF-8", it->c_str());
+ if (ErrorOrConverter) {
+ CmdArgs.push_back("-fexec-charset");
+ CmdArgs.push_back(Args.MakeArgString(*it));
+ } else {
+ D.Diag(clang::diag::err_drv_invalid_value) << "-fexec-charset" << *it;
+ }
}
RenderDiagnosticsOptions(D, Args, CmdArgs);
Index: clang/include/clang/Lex/Preprocessor.h
===================================================================
--- clang/include/clang/Lex/Preprocessor.h
+++ clang/include/clang/Lex/Preprocessor.h
@@ -23,6 +23,7 @@
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Lexer.h"
+#include "clang/Lex/LiteralConverter.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/ModuleLoader.h"
#include "clang/Lex/ModuleMap.h"
@@ -141,6 +142,7 @@
std::unique_ptr<ScratchBuffer> ScratchBuf;
HeaderSearch &HeaderInfo;
ModuleLoader &TheModuleLoader;
+ LiteralConverter LT;
/// External source of macros.
ExternalPreprocessorSource *ExternalSource;
@@ -931,6 +933,7 @@
SelectorTable &getSelectorTable() { return Selectors; }
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
+ LiteralConverter &getLiteralConverter() { return LT; }
void setExternalSource(ExternalPreprocessorSource *Source) {
ExternalSource = Source;
Index: clang/include/clang/Lex/LiteralSupport.h
===================================================================
--- clang/include/clang/Lex/LiteralSupport.h
+++ clang/include/clang/Lex/LiteralSupport.h
@@ -17,10 +17,12 @@
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/LiteralConverter.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CharSet.h"
#include "llvm/Support/DataTypes.h"
namespace clang {
@@ -211,6 +213,7 @@
const LangOptions &Features;
const TargetInfo &Target;
DiagnosticsEngine *Diags;
+ LiteralConverter *LT;
unsigned MaxTokenLength;
unsigned SizeBound;
@@ -222,16 +225,17 @@
unsigned UDSuffixToken;
unsigned UDSuffixOffset;
public:
- StringLiteralParser(ArrayRef<Token> StringToks,
- Preprocessor &PP, bool Complain = true);
+ StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
+ bool Complain = true,
+ ConversionAction Action = ToExecCharset);
StringLiteralParser(ArrayRef<Token> StringToks,
const SourceManager &sm, const LangOptions &features,
const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
- : SM(sm), Features(features), Target(target), Diags(diags),
- MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
- ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
- init(StringToks);
+ : SM(sm), Features(features), Target(target), Diags(diags), LT(nullptr),
+ MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
+ ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
+ init(StringToks, NoConversion);
}
@@ -277,7 +281,7 @@
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);
private:
- void init(ArrayRef<Token> StringToks);
+ void init(ArrayRef<Token> StringToks, ConversionAction Action);
bool CopyStringFragment(const Token &Tok, const char *TokBegin,
StringRef Fragment);
void DiagnoseLexingError(SourceLocation Loc);
Index: clang/include/clang/Lex/LiteralConverter.h
===================================================================
--- /dev/null
+++ clang/include/clang/Lex/LiteralConverter.h
@@ -0,0 +1,36 @@
+//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H
+#define LLVM_CLANG_LEX_LITERALCONVERTER_H
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CharSet.h"
+
+// Selects the charset whose converter is requested from LiteralConverter.
+// ToSystemCharset and ToExecCharset pick the system and execution charsets;
+// any other value (NoConversion) looks up the internal charset.
+enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset };
+
+// Holds the charset converters used for literals. Converters are created from
+// the internal charset to a target charset and cached under the target
+// charset's name.
+class LiteralConverter {
+ // Charset that every converter created by this class converts from.
+ llvm::StringRef InternalCharset;
+ // Charset reported by the target triple.
+ llvm::StringRef SystemCharset;
+ // Charset taken from LangOptions::ExecCharset, or the internal charset when
+ // that option is empty.
+ llvm::StringRef ExecCharset;
+ // Cache of created converters, keyed by target charset name.
+ llvm::StringMap<llvm::CharSetConverter> CharsetConverters;
+
+public:
+ // Return the cached converter for the charset named Codepage, or nullptr if
+ // none has been created.
+ llvm::CharSetConverter *getConverter(const char *Codepage);
+ // Return the cached converter for the charset selected by Action, or
+ // nullptr if none has been created.
+ llvm::CharSetConverter *getConverter(ConversionAction Action);
+ // Return the converter from the internal charset to To, creating and
+ // caching it if it is not cached yet.
+ llvm::CharSetConverter *createAndInsertCharConverter(const char *To);
+ // Set the three charsets from the language options and target, create the
+ // needed converters, and report an exec charset for which no converter is
+ // obtained through Diags.
+ void setConvertersFromOptions(const clang::LangOptions &Opts,
+ const clang::TargetInfo &TInfo,
+ clang::DiagnosticsEngine &Diags);
+};
+
+#endif
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -4353,6 +4353,11 @@
let Flags = [CC1Option, CC1AsOption, NoDriverOption] in {
+def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
+ HelpText<"Set the execution <charset> for string and character literals. "
+ "Supported character encodings include ISO8859-1, UTF-8, IBM-1047 "
+ "and those supported by the host iconv library.">,
+ MarshallingInfoString<LangOpts<"ExecCharset">>;
def target_cpu : Separate<["-"], "target-cpu">,
HelpText<"Target a specific cpu type">,
MarshallingInfoString<TargetOpts<"CPU">>;
Index: clang/include/clang/Basic/TokenKinds.h
===================================================================
--- clang/include/clang/Basic/TokenKinds.h
+++ clang/include/clang/Basic/TokenKinds.h
@@ -90,6 +90,13 @@
isStringLiteral(K) || K == tok::header_name;
}
+/// Return true if K is one of the UTF-8, UTF-16 or UTF-32 character-constant
+/// or string-literal token kinds.
+inline bool isUTFLiteral(TokenKind K) {
+  switch (K) {
+  case tok::utf8_char_constant:
+  case tok::utf8_string_literal:
+  case tok::utf16_char_constant:
+  case tok::utf16_string_literal:
+  case tok::utf32_char_constant:
+  case tok::utf32_string_literal:
+    return true;
+  default:
+    return false;
+  }
+}
+
/// Return true if this is any of tok::annot_* kinds.
bool isAnnotation(TokenKind K);
Index: clang/include/clang/Basic/LangOptions.h
===================================================================
--- clang/include/clang/Basic/LangOptions.h
+++ clang/include/clang/Basic/LangOptions.h
@@ -341,6 +341,9 @@
/// input is a header file (i.e. -x c-header).
bool IsHeaderFile = false;
+ /// Name of the exec charset to convert the internal charset to.
+ std::string ExecCharset;
+
LangOptions();
// Define accessors/mutators for language options of enumeration type.
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits