abhina.sreeskantharajan updated this revision to Diff 311911.
abhina.sreeskantharajan added a comment.

Thanks for your quick reviews! I haven't addressed all the comments yet but I 
plan to address all of them. I put up this patch early because it has a few 
major changes:

- moves the LiteralTranslator class into Preprocessor instead of keeping it 
as a static global object
- adds an isUTFLiteral() function to detect literals like u8"..." and skip 
translation for them
- translates wide string literals to the system charset for now (there is no 
implementation plan for -fwide-charset yet)
- removes the tests that check that -fexec-charset rejects every charset 
other than UTF-8




Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93031/new/

https://reviews.llvm.org/D93031

Files:
  clang/include/clang/Basic/LangOptions.h
  clang/include/clang/Basic/TokenKinds.h
  clang/include/clang/Driver/Options.td
  clang/include/clang/Lex/LiteralSupport.h
  clang/include/clang/Lex/LiteralTranslator.h
  clang/include/clang/Lex/Preprocessor.h
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Frontend/CompilerInstance.cpp
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/lib/Lex/CMakeLists.txt
  clang/lib/Lex/LiteralSupport.cpp
  clang/lib/Lex/LiteralTranslator.cpp
  clang/lib/Lex/Preprocessor.cpp
  clang/test/CodeGen/systemz-charset.c
  clang/test/Driver/cl-options.c
  clang/test/Driver/clang_f_opts.c
  llvm/include/llvm/ADT/Triple.h
  llvm/lib/Support/Triple.cpp

Index: llvm/lib/Support/Triple.cpp
===================================================================
--- llvm/lib/Support/Triple.cpp
+++ llvm/lib/Support/Triple.cpp
@@ -1023,6 +1023,13 @@
   return Tmp.split('-').second;                      // Strip second component
 }
 
+// The system charset is IBM-1047 when the OS is z/OS and UTF-8 otherwise.
+StringRef Triple::getSystemCharset() const {
+  const bool TargetsZOS = getOS() == llvm::Triple::ZOS;
+  // Both names are string literals, so the returned view never dangles.
+  return TargetsZOS ? StringRef("IBM-1047") : StringRef("UTF-8");
+}
+
 static unsigned EatNumber(StringRef &Str) {
   assert(!Str.empty() && Str[0] >= '0' && Str[0] <= '9' && "Not a number");
   unsigned Result = 0;
Index: llvm/include/llvm/ADT/Triple.h
===================================================================
--- llvm/include/llvm/ADT/Triple.h
+++ llvm/include/llvm/ADT/Triple.h
@@ -390,6 +390,9 @@
   /// if the environment component is present).
   StringRef getOSAndEnvironmentName() const;
 
+  /// getSystemCharset - Get the system charset of the triple.
+  StringRef getSystemCharset() const;
+
   /// @}
   /// @name Convenience Predicates
   /// @{
Index: clang/test/Driver/clang_f_opts.c
===================================================================
--- clang/test/Driver/clang_f_opts.c
+++ clang/test/Driver/clang_f_opts.c
@@ -209,9 +209,6 @@
 // RUN: %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-CHARSET %s
 // CHECK-INVALID-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1'
 
-// RUN: %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s
-// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1'
-
 // Test that we don't error on these.
 // RUN: %clang -### -S -Werror                                                \
 // RUN:     -falign-functions -falign-functions=2 -fno-align-functions        \
Index: clang/test/Driver/cl-options.c
===================================================================
--- clang/test/Driver/cl-options.c
+++ clang/test/Driver/cl-options.c
@@ -209,10 +209,6 @@
 // RUN: %clang_cl /source-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=source-charset-utf-16 %s
 // source-charset-utf-16: invalid value 'utf-16' in '/source-charset:utf-16'
 
-// /execution-charset: should warn on everything except UTF-8.
-// RUN: %clang_cl /execution-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-utf-16 %s
-// execution-charset-utf-16: invalid value 'utf-16' in '/execution-charset:utf-16'
-//
 // RUN: %clang_cl /Umymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s
 // RUN: %clang_cl /U mymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s
 // U: "-U" "mymacro"
Index: clang/test/CodeGen/systemz-charset.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/systemz-charset.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
+// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
+
+char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+// CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
+
+char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz";
+//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00"
+
+char *Digits = "0123456789";
+// CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00"
+
+char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@=";
+// CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00"
+
+char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00"
+
+char *HexCharacters = "\x12\x13\x14";
+//CHECK: c"\12\13\14\00"
+
+char *OctalCharacters = "\141\142\143";
+//CHECK: c"abc\00"
+
+char singleChar = 'a';
+//CHECK: i8 -127
Index: clang/lib/Lex/Preprocessor.cpp
===================================================================
--- clang/lib/Lex/Preprocessor.cpp
+++ clang/lib/Lex/Preprocessor.cpp
@@ -85,7 +85,8 @@
     : PPOpts(std::move(PPOpts)), Diags(&diags), LangOpts(opts),
       FileMgr(Headers.getFileMgr()), SourceMgr(SM),
       ScratchBuf(new ScratchBuffer(SourceMgr)), HeaderInfo(Headers),
-      TheModuleLoader(TheModuleLoader), ExternalSource(nullptr),
+      TheModuleLoader(TheModuleLoader), LT(new LiteralTranslator()),
+      ExternalSource(nullptr),
       // As the language options may have not been loaded yet (when
       // deserializing an ASTUnit), adding keywords to the identifier table is
       // deferred to Preprocessor::Initialize().
Index: clang/lib/Lex/LiteralTranslator.cpp
===================================================================
--- /dev/null
+++ clang/lib/Lex/LiteralTranslator.cpp
@@ -0,0 +1,70 @@
+//===--- LiteralTranslator.cpp - Translator for String Literals -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/LiteralTranslator.h"
+#include "clang/Basic/DiagnosticDriver.h"
+
+using namespace llvm;
+
+llvm::CharSetConverter *
+LiteralTranslator::getConversionTable(const char *Codepage) {
+  // Look the code page up in the cache; a miss yields a null converter.
+  auto Entry = ExecCharsetTables.find(Codepage);
+  const bool Cached = Entry != ExecCharsetTables.end();
+  return Cached ? &Entry->second : nullptr;
+}
+
+CharsetTableStatusCode
+LiteralTranslator::findOrCreateExecCharsetTable(const char *To) {
+  // Nothing to do when a converter for this target charset is already cached.
+  if (getConversionTable(To) != nullptr)
+    return CharsetTableOk;
+
+  // Build a converter from the internal charset to the requested one.
+  ErrorOr<CharSetConverter> NewConverter =
+      llvm::CharSetConverter::create(InternalCharset.data(), To);
+  if (!NewConverter)
+    return InvalidCharsetTable;
+
+  ExecCharsetTables.insert_or_assign(StringRef(To), std::move(*NewConverter));
+  return CharsetTableOk;
+}
+
+llvm::CharSetConverter *
+LiteralTranslator::getCharConversionTable(ConversionState TranslationState) {
+  // Pick the charset name for the requested state; any state other than the
+  // two translating ones looks up the internal charset.
+  const bool ToSystem = TranslationState == TranslateToSystemCharset;
+  const bool ToExec = TranslationState == TranslateToExecCharset;
+  StringRef Target = ToSystem ? SystemCharset
+                     : ToExec ? ExecCharset
+                              : InternalCharset;
+  return getConversionTable(Target.data());
+}
+
+void LiteralTranslator::setTranslationTables(const clang::LangOptions &Opts,
+                                             const clang::TargetInfo &TInfo,
+                                             clang::DiagnosticsEngine &Diags) {
+  InternalCharset = "UTF-8";
+  SystemCharset = TInfo.getTriple().getSystemCharset();
+  if (Opts.ExecCharset.empty())
+    ExecCharset = InternalCharset;
+  else
+    ExecCharset = Opts.ExecCharset;
+
+  // The system charset needs a table only if it differs from the internal.
+  if (SystemCharset != InternalCharset)
+    findOrCreateExecCharsetTable(SystemCharset.data());
+
+  // Same for the charset given by the fexec-charset option; a table that
+  // cannot be created is reported as an invalid value for that option.
+  if (ExecCharset != InternalCharset &&
+      findOrCreateExecCharsetTable(ExecCharset.data()) != CharsetTableOk)
+    Diags.Report(clang::diag::err_drv_invalid_value)
+        << "-fexec-charset" << ExecCharset;
+}
Index: clang/lib/Lex/LiteralSupport.cpp
===================================================================
--- clang/lib/Lex/LiteralSupport.cpp
+++ clang/lib/Lex/LiteralSupport.cpp
@@ -93,7 +93,8 @@
                                   const char *ThisTokEnd, bool &HadError,
                                   FullSourceLoc Loc, unsigned CharWidth,
                                   DiagnosticsEngine *Diags,
-                                  const LangOptions &Features) {
+                                  const LangOptions &Features,
+                                  llvm::CharSetConverter *Converter) {
   const char *EscapeBegin = ThisTokBuf;
 
   // Skip the '\' char.
@@ -102,6 +103,7 @@
   // We know that this character can't be off the end of the buffer, because
   // that would have been \", which would not have been the end of string.
   unsigned ResultChar = *ThisTokBuf++;
+  bool Translate = true;
   switch (ResultChar) {
   // These map to themselves.
   case '\\': case '\'': case '"': case '?': break;
@@ -142,6 +144,7 @@
     ResultChar = 11;
     break;
   case 'x': { // Hex escape.
+    Translate = false;
     ResultChar = 0;
     if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
       if (Diags)
@@ -179,6 +182,7 @@
   case '4': case '5': case '6': case '7': {
     // Octal escapes.
     --ThisTokBuf;
+    Translate = false;
     ResultChar = 0;
 
     // Octal escapes are a series of octal digits with maximum length 3.
@@ -224,6 +228,16 @@
     break;
   }
 
+  if (Translate && Converter) {
+    // ResultChar is either UTF-8 or ASCII literal and can only be converted
+    // to EBCDIC on z/OS if the character can be represented in one byte.
+    if (ResultChar < 0x100) {
+      SmallString<8> ResultCharConv;
+      Converter->convert(StringRef((char *)&ResultChar), ResultCharConv);
+      void *Pointer = &ResultChar;
+      memcpy(Pointer, ResultCharConv.data(), sizeof(unsigned));
+    }
+  }
   return ResultChar;
 }
 
@@ -1236,11 +1250,13 @@
 ///
 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                      SourceLocation Loc, Preprocessor &PP,
-                                     tok::TokenKind kind) {
+                                     tok::TokenKind kind,
+                                     ConversionState translationState) {
   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
   HadError = false;
 
   Kind = kind;
+  LiteralTranslator *LT = PP.getLiteralTranslator();
 
   const char *TokBegin = begin;
 
@@ -1302,6 +1318,15 @@
     largest_character_for_kind = 0x7Fu;
   }
 
+  TranslationState = translationState;
+  if (Kind == tok::wide_string_literal)
+    TranslationState = TranslateToSystemCharset;
+  else if (isUTFLiteral(Kind))
+    TranslationState = NoTranslation;
+
+  llvm::CharSetConverter *Converter =
+      LT ? LT->getCharConversionTable(TranslationState) : nullptr;
+
   while (begin != end) {
     // Is this a span of non-escape characters?
     if (begin[0] != '\\') {
@@ -1339,6 +1364,11 @@
             HadError = true;
             PP.Diag(Loc, diag::err_character_too_large);
           }
+          if (!HadError && Converter) {
+            SmallString<1> ConvertedChar;
+            Converter->convert(StringRef((char *)tmp_out_start), ConvertedChar);
+            memmove((void *)tmp_out_start, ConvertedChar.data(), 1);
+          }
         }
       }
 
@@ -1361,9 +1391,9 @@
     }
     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
     uint64_t result =
-      ProcessCharEscape(TokBegin, begin, end, HadError,
-                        FullSourceLoc(Loc,PP.getSourceManager()),
-                        CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
+        ProcessCharEscape(TokBegin, begin, end, HadError,
+                          FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
+                          &PP.getDiagnostics(), PP.getLangOpts(), nullptr);
     *buffer_begin++ = result;
   }
 
@@ -1471,13 +1501,16 @@
 ///         hex-digit hex-digit hex-digit hex-digit
 /// \endverbatim
 ///
-StringLiteralParser::
-StringLiteralParser(ArrayRef<Token> StringToks,
-                    Preprocessor &PP, bool Complain)
-  : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
-    Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
-    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
-    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
+
+StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
+                                         Preprocessor &PP, bool Complain,
+                                         ConversionState translationState)
+    : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
+      Target(PP.getTargetInfo()),
+      Diags(Complain ? &PP.getDiagnostics() : nullptr),
+      LT(PP.getLiteralTranslator()), MaxTokenLength(0), SizeBound(0),
+      CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()),
+      hadError(false), Pascal(false), TranslationState(translationState) {
   init(StringToks);
 }
 
@@ -1557,6 +1590,15 @@
 
   SourceLocation UDSuffixTokLoc;
 
+  ConversionState State = TranslationState;
+  if (Kind == tok::wide_string_literal)
+    State = TranslateToSystemCharset;
+  else if (isUTFLiteral(Kind))
+    State = NoTranslation;
+
+  llvm::CharSetConverter *Converter =
+      LT ? LT->getCharConversionTable(State) : nullptr;
+
   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
     const char *ThisTokBuf = &TokenBuf[0];
     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
@@ -1652,6 +1694,13 @@
         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
           hadError = true;
 
+        if (!hadError && Converter) {
+          SmallString<256> CpConv;
+          int ResultLength = BeforeCRLF.size() * CharByteWidth;
+          unsigned char *Cp = (unsigned char *)ResultPtr - ResultLength;
+          Converter->convert(StringRef((char *)Cp, ResultLength), CpConv);
+          memmove(Cp, CpConv.data(), ResultLength);
+        }
         // Point into the \n inside the \r\n sequence and operate on the
         // remaining portion of the literal.
         RemainingTokenSpan = AfterCRLF.substr(1);
@@ -1685,10 +1734,19 @@
             ++ThisTokBuf;
           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
 
+          int Length = ThisTokBuf - InStart;
           // Copy the character span over.
           if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                  StringRef(InStart, ThisTokBuf - InStart)))
             hadError = true;
+
+          if (!hadError && Converter) {
+            SmallString<256> CpConv;
+            int ResultLength = Length * CharByteWidth;
+            unsigned char *Cp = (unsigned char *)ResultPtr - ResultLength;
+            Converter->convert(StringRef((char *)Cp, ResultLength), CpConv);
+            memmove(Cp, CpConv.data(), ResultLength);
+          }
           continue;
         }
         // Is this a Universal Character Name escape?
@@ -1701,9 +1759,9 @@
         }
         // Otherwise, this is a non-UCN escape character.  Process it.
         unsigned ResultChar =
-          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
-                            FullSourceLoc(StringToks[i].getLocation(), SM),
-                            CharByteWidth*8, Diags, Features);
+            ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
+                              FullSourceLoc(StringToks[i].getLocation(), SM),
+                              CharByteWidth * 8, Diags, Features, Converter);
 
         if (CharByteWidth == 4) {
           // FIXME: Make the type of the result buffer correct instead of
@@ -1872,6 +1930,14 @@
   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
   ++SpellingPtr;
 
+  ConversionState State = TranslationState;
+  if (Kind == tok::wide_string_literal)
+    State = TranslateToSystemCharset;
+  else if (isUTFLiteral(Kind))
+    State = NoTranslation;
+  llvm::CharSetConverter *Converter =
+      LT ? LT->getCharConversionTable(State) : nullptr;
+
   // Skip over bytes until we find the offset we're looking for.
   while (ByteNo) {
     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
@@ -1897,8 +1963,8 @@
       ByteNo -= Len;
     } else {
       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
-                        FullSourceLoc(Tok.getLocation(), SM),
-                        CharByteWidth*8, Diags, Features);
+                        FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
+                        Diags, Features, Converter);
       --ByteNo;
     }
     assert(!HadError && "This method isn't valid on erroneous strings");
Index: clang/lib/Lex/CMakeLists.txt
===================================================================
--- clang/lib/Lex/CMakeLists.txt
+++ clang/lib/Lex/CMakeLists.txt
@@ -8,6 +8,7 @@
   HeaderSearch.cpp
   Lexer.cpp
   LiteralSupport.cpp
+  LiteralTranslator.cpp
   MacroArgs.cpp
   MacroInfo.cpp
   ModuleMap.cpp
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -3567,6 +3567,11 @@
       Args.hasFlag(OPT_fexperimental_relative_cxx_abi_vtables,
                    OPT_fno_experimental_relative_cxx_abi_vtables,
                    /*default=*/false);
+
+  if (Arg *ExecCharset = Args.getLastArg(OPT_fexec_charset)) {
+    StringRef Value = ExecCharset->getValue();
+    Opts.ExecCharset = (std::string)Value;
+  }
 }
 
 static bool isStrictlyPreprocessorAction(frontend::ActionKind Action) {
Index: clang/lib/Frontend/CompilerInstance.cpp
===================================================================
--- clang/lib/Frontend/CompilerInstance.cpp
+++ clang/lib/Frontend/CompilerInstance.cpp
@@ -12,6 +12,7 @@
 #include "clang/AST/Decl.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/LangStandard.h"
 #include "clang/Basic/SourceManager.h"
@@ -29,6 +30,7 @@
 #include "clang/Frontend/Utils.h"
 #include "clang/Frontend/VerifyDiagnosticConsumer.h"
 #include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/LiteralTranslator.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
@@ -475,6 +477,8 @@
                            /*ShowAllHeaders=*/true, /*OutputPath=*/"",
                            /*ShowDepth=*/true, /*MSStyle=*/true);
   }
+  PP->getLiteralTranslator()->setTranslationTables(getLangOpts(), getTarget(),
+                                                   getDiagnostics());
 }
 
 std::string CompilerInstance::getSpecificModuleCachePath() {
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -5966,12 +5966,15 @@
                                           << value;
   }
 
-  // -fexec_charset=UTF-8 is default. Reject others
-  if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
-    StringRef value = execCharset->getValue();
-    if (!value.equals_lower("utf-8"))
-      D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
-                                          << value;
+  // Pass all -fexec-charset options to cc1.
+  std::vector<std::string> vList =
+      Args.getAllArgValues(options::OPT_fexec_charset_EQ);
+  // Set the default fexec-charset as the system charset.
+  CmdArgs.push_back("-fexec-charset");
+  CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset()));
+  for (auto it = vList.begin(), ie = vList.end(); it != ie; ++it) {
+    CmdArgs.push_back("-fexec-charset");
+    CmdArgs.push_back(Args.MakeArgString(*it));
   }
 
   RenderDiagnosticsOptions(D, Args, CmdArgs);
Index: clang/include/clang/Lex/Preprocessor.h
===================================================================
--- clang/include/clang/Lex/Preprocessor.h
+++ clang/include/clang/Lex/Preprocessor.h
@@ -23,6 +23,7 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/Lexer.h"
+#include "clang/Lex/LiteralTranslator.h"
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/ModuleLoader.h"
 #include "clang/Lex/ModuleMap.h"
@@ -141,6 +142,7 @@
   std::unique_ptr<ScratchBuffer> ScratchBuf;
   HeaderSearch      &HeaderInfo;
   ModuleLoader      &TheModuleLoader;
+  LiteralTranslator *LT = nullptr;
 
   /// External source of macros.
   ExternalPreprocessorSource *ExternalSource;
@@ -931,6 +933,7 @@
   SelectorTable &getSelectorTable() { return Selectors; }
   Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
   llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
+  LiteralTranslator *getLiteralTranslator() { return LT; }
 
   void setExternalSource(ExternalPreprocessorSource *Source) {
     ExternalSource = Source;
Index: clang/include/clang/Lex/LiteralTranslator.h
===================================================================
--- /dev/null
+++ clang/include/clang/Lex/LiteralTranslator.h
@@ -0,0 +1,46 @@
+//===--- clang/Lex/LiteralTranslator.h - Translator for Literals -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_LITERALTRANSLATOR_H
+#define LLVM_CLANG_LEX_LITERALTRANSLATOR_H
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CharSet.h"
+
+enum ConversionState { // Selects which charset a literal is translated to.
+  NoTranslation,            // Table lookup falls back to the internal charset.
+  TranslateToSystemCharset, // Look up the table for the system charset.
+  TranslateToExecCharset    // Look up the table for the exec charset.
+};
+
+enum CharsetTableStatusCode { // Result of findOrCreateExecCharsetTable().
+  CharsetTableOk = 1,  // A converter was already cached or was just created.
+  InvalidCharsetTable, // Creating the converter failed.
+};
+
+class LiteralTranslator { // Holds the charset converters used for literals.
+public:
+  llvm::StringRef InternalCharset; // Set to "UTF-8" by setTranslationTables().
+  llvm::StringRef SystemCharset;   // Taken from the target triple.
+  // NOTE(review): may view LangOptions::ExecCharset storage - confirm lifetime.
+  llvm::StringRef ExecCharset;
+  llvm::StringMap<llvm::CharSetConverter> ExecCharsetTables; // Keyed by name.
+
+  // Returns the cached converter for Codepage, or null when none is cached.
+  llvm::CharSetConverter *getConversionTable(const char *Codepage);
+  // Ensures a converter from the internal charset to To is cached.
+  CharsetTableStatusCode findOrCreateExecCharsetTable(const char *To);
+  // Returns the cached converter for the charset selected by the state.
+  llvm::CharSetConverter *
+  getCharConversionTable(ConversionState TranslationState);
+  // Records the three charset names and creates the tables that are needed;
+  // reports err_drv_invalid_value if the exec charset table cannot be made.
+  void setTranslationTables(const clang::LangOptions &Opts,
+                            const clang::TargetInfo &TInfo,
+                            clang::DiagnosticsEngine &Diags);
+};
+
+#endif
Index: clang/include/clang/Lex/LiteralSupport.h
===================================================================
--- clang/include/clang/Lex/LiteralSupport.h
+++ clang/include/clang/Lex/LiteralSupport.h
@@ -17,10 +17,12 @@
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/LiteralTranslator.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CharSet.h"
 #include "llvm/Support/DataTypes.h"
 
 namespace clang {
@@ -184,9 +186,10 @@
   SmallString<32> UDSuffixBuf;
   unsigned UDSuffixOffset;
 public:
-  CharLiteralParser(const char *begin, const char *end,
-                    SourceLocation Loc, Preprocessor &PP,
-                    tok::TokenKind kind);
+  CharLiteralParser(const char *begin, const char *end, SourceLocation Loc,
+                    Preprocessor &PP, tok::TokenKind kind,
+                    ConversionState translationState = TranslateToExecCharset);
+  ConversionState TranslationState;
 
   bool hadError() const { return HadError; }
   bool isAscii() const { return Kind == tok::char_constant; }
@@ -211,6 +214,7 @@
   const LangOptions &Features;
   const TargetInfo &Target;
   DiagnosticsEngine *Diags;
+  LiteralTranslator *LT;
 
   unsigned MaxTokenLength;
   unsigned SizeBound;
@@ -222,21 +226,25 @@
   unsigned UDSuffixToken;
   unsigned UDSuffixOffset;
 public:
-  StringLiteralParser(ArrayRef<Token> StringToks,
-                      Preprocessor &PP, bool Complain = true);
-  StringLiteralParser(ArrayRef<Token> StringToks,
-                      const SourceManager &sm, const LangOptions &features,
-                      const TargetInfo &target,
-                      DiagnosticsEngine *diags = nullptr)
-    : SM(sm), Features(features), Target(target), Diags(diags),
-      MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
-      ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
+  StringLiteralParser(
+      ArrayRef<Token> StringToks, Preprocessor &PP, bool Complain = true,
+      ConversionState translationState = TranslateToExecCharset);
+  StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
+                      const LangOptions &features, const TargetInfo &target,
+                      DiagnosticsEngine *diags = nullptr,
+                      ConversionState translation = TranslateToExecCharset)
+      : SM(sm), Features(features), Target(target), Diags(diags),
+        MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
+        ResultPtr(ResultBuf.data()), hadError(false), Pascal(false),
+        TranslationState(translation) {
+    LT = new LiteralTranslator();
+    LT->setTranslationTables(Features, Target, *Diags);
     init(StringToks);
   }
 
-
   bool hadError;
   bool Pascal;
+  ConversionState TranslationState;
 
   StringRef GetString() const {
     return StringRef(ResultBuf.data(), GetStringLength());
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -3580,6 +3580,8 @@
 
 let Flags = [CC1Option, CC1AsOption, NoDriverOption] in {
 
+def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
+  HelpText<"Set the execution <charset> for string and character literals">;
 def target_cpu : Separate<["-"], "target-cpu">,
   HelpText<"Target a specific cpu type">;
 def tune_cpu : Separate<["-"], "tune-cpu">,
Index: clang/include/clang/Basic/TokenKinds.h
===================================================================
--- clang/include/clang/Basic/TokenKinds.h
+++ clang/include/clang/Basic/TokenKinds.h
@@ -90,6 +90,13 @@
          isStringLiteral(K) || K == tok::header_name;
 }
 
+/// Return true if K is a utf8/utf16/utf32 char-constant or string-literal kind.
+inline bool isUTFLiteral(TokenKind K) {
+  return K == tok::utf8_string_literal || K == tok::utf16_string_literal ||
+         K == tok::utf32_string_literal || K == tok::utf8_char_constant ||
+         K == tok::utf16_char_constant || K == tok::utf32_char_constant;
+}
+
 /// Return true if this is any of tok::annot_* kinds.
 bool isAnnotation(TokenKind K);
 
Index: clang/include/clang/Basic/LangOptions.h
===================================================================
--- clang/include/clang/Basic/LangOptions.h
+++ clang/include/clang/Basic/LangOptions.h
@@ -303,6 +303,9 @@
   /// input is a header file (i.e. -x c-header).
   bool IsHeaderFile = false;
 
+  /// Name of the exec charset to convert the internal charset to.
+  std::string ExecCharset;
+
   LangOptions();
 
   // Define accessors/mutators for language options of enumeration type.
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to