https://github.com/azhan92 updated https://github.com/llvm/llvm-project/pull/204233
>From 5698fb7f4167d504a56db66c34f10becb3e91a5f Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 23 Jun 2026 13:34:38 -0400 Subject: [PATCH 01/14] Add changes from finput-charset PR --- .../clang/Basic/DiagnosticCommonKinds.td | 3 + clang/include/clang/Basic/LangOptions.h | 3 + clang/include/clang/Basic/SourceManager.h | 6 ++ clang/lib/Basic/SourceManager.cpp | 94 ++++++++++++++----- 4 files changed, 85 insertions(+), 21 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index f2ed2f4698b8d..8ebac3908b465 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -417,6 +417,9 @@ def note_file_sloc_usage : Note< "%plural{0:|: plus %2B (%human2B) for macro expansions}2">; def note_file_misc_sloc_usage : Note< "%0 additional files entered using a total of %1B (%human1B) of space">; +def warn_charset_conversion_failed : Warning< + "conversion from source encoding failed for '%0': %1; interpreting as IBM-1047">, + InGroup<DiagGroup<"charset-conversion-failed">>; // Modules def err_module_format_unhandled : Error< diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 9af036156b1ad..31f34207707c8 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -618,6 +618,9 @@ class LangOptions : public LangOptionsBase { /// The allocation token mode. std::optional<llvm::AllocTokenMode> AllocTokenMode; + /// Name of the input encoding to convert to the internal encoding. 
+ std::string InputEncoding; + LangOptions(); /// Set language defaults for the given input language and diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index 4217b8683da1e..395fcfc9f71e8 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -50,6 +50,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TextEncoding.h" #include <cassert> #include <cstddef> #include <map> @@ -156,6 +157,11 @@ class alignas(8) ContentCache { /// FIXME: Remove this once OrigEntry is a FileEntryRef with a stable name. StringRef Filename; + /// Information on whether this is associated with a FileID for a file (as + /// opposed to a buffer) and, if so, what conversion (if any) was requested. + llvm::PointerIntPair<llvm::TextEncodingConverter *, 1u, bool> + FileIDConverterInfo; + /// A bump pointer allocated array of offsets for each source line. /// /// This is lazily computed. The lines are owned by the SourceManager diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index b6cc6ec9365f5..8b9ee14c476a7 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -31,6 +31,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SmallVectorMemoryBuffer.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -136,7 +137,51 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, Buffer = std::move(*BufferOrError); - // Check that the file's size fits in an 'unsigned' (with room for a + // Unless this is a named pipe (in which case we can handle a mismatch), + // check that the file's size is the same as in the file entry (which may + // have come from a stat cache). 
+ assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize()); + if (!ContentsEntry->isNamedPipe() && + Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { + Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); + + return std::nullopt; + } + + // Convert source from the input charset to UTF-8 if necessary. + llvm::TextEncodingConverter *Converter = FileIDConverterInfo.getPointer(); + if (Converter) { + StringRef OriginalBuf = Buffer->getBuffer(); + llvm::SmallString<0> UTF8Buf; + UTF8Buf.reserve(OriginalBuf.size() + 1); + + std::error_code EC = Converter->convert(OriginalBuf, UTF8Buf); + if (EC) { + // If conversion fails, emit a warning and fall back to interpreting the + // file as UTF-8 without conversion. + // + // This allows the compiler to accept system or third-party headers that + // are encoded in UTF-8 even if conversion from the option-specified input + // charset failed. + // + // Diagnostics already exist when files are not well-formed UTF-8. + // + // TODO: Add input byte offset information. + // + // TODO: Consider adjusting the message to omit the "interpreting as + // UTF-8" recovery description if the warning has been upgraded to an + // error. + Diag.Report(Loc, diag::warn_charset_conversion_failed) + << ContentsEntry->getName() << EC.message(); + } else { + // TODO: Reclaim memory if the buffer size exceeds the content. + auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>( + std::move(UTF8Buf), Buffer->getBufferIdentifier()); + Buffer = std::move(NewBuf); + } + } + + // Check that the buffer's size fits in an 'unsigned' (with room for a // past-the-end value). This is deeply regrettable, but various parts of // Clang (including elsewhere in this file!) 
use 'unsigned' to represent file // offsets, line numbers, string literal lengths, and so on, and fail @@ -151,22 +196,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, return std::nullopt; } - // Unless this is a named pipe (in which case we can handle a mismatch), - // check that the file's size is the same as in the file entry (which may - // have come from a stat cache). - // The buffer will always be larger than the file size on z/OS in the presence - // of characters outside the base character set. - assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize()); - if (!ContentsEntry->isNamedPipe() && - Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { - Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); - - return std::nullopt; - } - - // If the buffer is valid, check to see if it has a UTF Byte Order Mark - // (BOM). We only support UTF-8 with and without a BOM right now. See - // http://en.wikipedia.org/wiki/Byte_order_mark for more information. + // If the buffer is valid, check to see if it has a UTF Byte Order Mark (BOM) + // Note that any conversion requested using `-finput-charset` (if successful) + // has already occurred, so we are expecting UTF-8 with or without a BOM. + // + // In theory, if we see a non-UTF-8 BOM, we can assume that an appropriate + // conversion was not supplied via `-finput-charset` and we could try to + // convert based on the BOM. + // + // See http://en.wikipedia.org/wiki/Byte_order_mark for more information. StringRef BufStr = Buffer->getBuffer(); const char *InvalidBOM = getInvalidBOM(BufStr); @@ -537,15 +575,29 @@ FileID SourceManager::getNextFileID(FileID FID) const { /// being \#included from the specified IncludePosition. 
FileID SourceManager::createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, - SrcMgr::CharacteristicKind FileCharacter, + SrcMgr::CharacteristicKind FileCharacter, int LoadedID, SourceLocation::UIntTy LoadedOffset) { SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile, isSystem(FileCharacter)); + #ifndef NDEBUG + // Either the content cache has never been used for a FileID (and, if we are + // being asked to use a converter, there should be no valid buffer set up for + // it) or the conversion (or lack thereof) should be the same as that used + // previously. + auto [CacheConverter, CacheUsedByFileID] = IR.FileIDConverterInfo; + if (CacheUsedByFileID) + assert(CacheConverter == Converter); + else + assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded()); +#endif + IR.FileIDConverterInfo.setPointerAndInt(Converter, true); + // If this is a named pipe, immediately load the buffer to ensure subsequent // calls to ContentCache::getSize() are accurate. - if (IR.ContentsEntry->isNamedPipe()) + // Do the same if character-encoding conversion was requested. + if (IR.ContentsEntry->isNamedPipe() || Converter) (void)IR.getBufferOrNone(Diag, getFileManager(), SourceLocation()); return createFileIDImpl(IR, SourceFile.getName(), IncludePos, FileCharacter, @@ -585,8 +637,8 @@ FileID SourceManager::getOrCreateFileID(FileEntryRef SourceFile, SrcMgr::CharacteristicKind FileCharacter) { FileID ID = translateFile(SourceFile); - return ID.isValid() ? ID : createFileID(SourceFile, SourceLocation(), - FileCharacter); + return ID.isValid() ? 
ID + : createFileID(SourceFile, SourceLocation(), FileCharacter); } /// createFileID - Create a new FileID for the specified ContentCache and >From 0991f6ebe08d9b3d4b5d12915fe531f1efce0974 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 23 Jun 2026 13:34:54 -0400 Subject: [PATCH 02/14] getEncodingNameFromFileTag --- llvm/include/llvm/Support/AutoConvert.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index d68b0e8b515e0..8797664b1b337 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -105,6 +105,31 @@ inline ErrorOr<bool> needConversion(const Twine &FileName, const int FD = -1) { return false; } +inline ErrorOr<SmallString<32>> +getEncodingNameFromFileTag(const Twine &FileName, const int FD = -1) { +#ifdef __MVS__ + ErrorOr<__ccsid_t> TagOrErr = getzOSFileTag(FileName, FD); + if (!TagOrErr) + return TagOrErr.getError(); + + __ccsid_t Tag = *TagOrErr; + if (Tag == 0) + return {}; // Return empty string for no tag + + if (Tag == 1208) + return {"utf-8"}; + + if (Tag == 1047) + return {"ibm-1047"}; + + SmallString<32> Result; + raw_svector_ostream(Result) << Tag; + return Result; +#else + return {}; // Return empty string for non-MVS platforms +#endif +} + } /* namespace llvm */ #endif /* __cplusplus */ >From 56a351c4495e0447618b75934a253dc173f29ced Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 23 Jun 2026 13:35:07 -0400 Subject: [PATCH 03/14] Add text encoding cache --- clang/include/clang/Basic/SourceManager.h | 14 ++++++++++ clang/lib/Basic/SourceManager.cpp | 34 +++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index 395fcfc9f71e8..4c90e4a538c52 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ 
b/clang/include/clang/Basic/SourceManager.h @@ -846,6 +846,11 @@ class SourceManager : public RefCountedBase<SourceManager> { /// we can add a cc1-level option to do so. SmallVector<std::pair<std::string, FullSourceLoc>, 2> StoredModuleBuildStack; + /// Cache of all text encoding converters used by this SourceManager. + /// This includes both the input charset converter and file tag converters. + /// Maps from the source encoding name (canonicalized when known) to the converter. + llvm::StringMap<std::unique_ptr<llvm::TextEncodingConverter>> ConverterCache; + public: SourceManager(DiagnosticsEngine &Diag, FileManager &FileMgr, bool UserFilesAreVolatile = false); @@ -863,6 +868,15 @@ class SourceManager : public RefCountedBase<SourceManager> { FileManager &getFileManager() const { return FileMgr; } + /// Get or create a text encoding converter from the cache. + /// This method manages all converters (input charset and file tag converters) + /// in a single cache owned by SourceManager. + /// \param SourceEncoding the source character encoding name + /// \return the converter, null if the source is already UTF-8, or an error code + /// The target encoding is always UTF-8. + llvm::ErrorOr<llvm::TextEncodingConverter *> + getOrCreateConverter(llvm::StringRef SourceEncoding); + /// Set true if the SourceManager should report the original file name /// for contents of files that were overridden by other files. Defaults to /// true. 
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 8b9ee14c476a7..46a7b8b85e2dd 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -467,6 +467,40 @@ ContentCache &SourceManager::createMemBufferContentCache( return *Entry; } +llvm::ErrorOr<llvm::TextEncodingConverter *> +SourceManager::getOrCreateConverter(llvm::StringRef SourceEncoding) { + // Use getKnownEncoding to get normalized encoding names + std::optional<llvm::TextEncoding> SourceKnown = + llvm::TextEncodingConverter::getKnownEncoding(SourceEncoding); + + if (SourceKnown && *SourceKnown == llvm::TextEncoding::UTF8) + return nullptr; + + // Create a cache key using canonical encoding name + llvm::StringRef CacheKey = SourceKnown + ? llvm::TextEncodingConverter::getKnownEncodingName(*SourceKnown) + : SourceEncoding; + + // Check if converter already exists in cache + auto It = ConverterCache.find(CacheKey); + if (It != ConverterCache.end()) + return It->second.get(); + + // Create a new converter + llvm::ErrorOr<llvm::TextEncodingConverter> NewConverter = + llvm::TextEncodingConverter::create(SourceEncoding, "UTF-8"); + + if (!NewConverter) + return NewConverter.getError(); + + // Store the converter in the cache + auto Inserted = ConverterCache.insert( + std::make_pair(CacheKey, std::make_unique<llvm::TextEncodingConverter>( + std::move(*NewConverter)))); + + return Inserted.first->second.get(); +} + const SrcMgr::SLocEntry &SourceManager::loadSLocEntry(unsigned Index, bool *Invalid) const { return const_cast<SourceManager *>(this)->loadSLocEntry(Index, Invalid); >From 46220a9d135a11bc0c36171cb8fad0e0cd654264 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 23 Jun 2026 14:06:59 -0400 Subject: [PATCH 04/14] Get canonical names for encodings --- llvm/include/llvm/Support/TextEncoding.h | 11 +++++++++++ llvm/lib/Support/TextEncoding.cpp | 13 ++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff 
--git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h index 8a304910aa5dd..09e24000594db 100644 --- a/llvm/include/llvm/Support/TextEncoding.h +++ b/llvm/include/llvm/Support/TextEncoding.h @@ -105,6 +105,17 @@ class TextEncodingConverter { LLVM_ABI static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To); + /// Maps the encoding name to enum constant if possible. + /// Uses normalized charset name matching. + /// \param[in] Name the character encoding name + /// \return the TextEncoding enum value if known, std::nullopt otherwise + LLVM_ABI static std::optional<TextEncoding> getKnownEncoding(StringRef Name); + + /// Returns the canonical name for a known encoding. + /// \param[in] Encoding the TextEncoding enum value + /// \return the canonical name for the encoding (e.g., "UTF-8" or "IBM-1047") + LLVM_ABI static StringRef getKnownEncodingName(TextEncoding Encoding); + TextEncodingConverter(const TextEncodingConverter &) = delete; TextEncodingConverter &operator=(const TextEncodingConverter &) = delete; diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index d36f02c1300b9..8e9653dab38ec 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -48,7 +48,7 @@ static void normalizeCharSetName(StringRef CSName, } // Maps the encoding name to enum constant if possible. -static std::optional<TextEncoding> getKnownEncoding(StringRef Name) { +std::optional<TextEncoding> TextEncodingConverter::getKnownEncoding(StringRef Name) { SmallString<16> Normalized; normalizeCharSetName(Name, Normalized); if (Normalized.equals("utf8")) @@ -58,6 +58,17 @@ static std::optional<TextEncoding> getKnownEncoding(StringRef Name) { return std::nullopt; } +// Returns the canonical name for a known encoding. 
+StringRef TextEncodingConverter::getKnownEncodingName(TextEncoding Encoding) { + switch (Encoding) { + case TextEncoding::UTF8: + return "UTF-8"; + case TextEncoding::IBM1047: + return "IBM-1047"; + } + llvm_unreachable("Invalid TextEncoding value"); +} + [[maybe_unused]] static void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, SmallVectorImpl<char> &Result) { >From 24c1fd677f48da1127e292053e02b92971158a14 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Mon, 22 Jun 2026 11:17:29 -0400 Subject: [PATCH 05/14] File mismatch checking --- clang/lib/Basic/FileManager.cpp | 17 ++++++++++++----- llvm/include/llvm/Support/VirtualFileSystem.h | 12 ++++++++++++ llvm/lib/Support/VirtualFileSystem.cpp | 13 ++++++++++--- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 8fb3ba0a27aad..717a692661588 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -539,15 +539,22 @@ FileManager::getBufferForFile(FileEntryRef FE, bool isVolatile, FileSize = -1; StringRef Filename = FE.getName(); - // If the file is already open, use the open file descriptor. + // If the file is already open, check if the mode matches. if (Entry->File) { - auto Result = Entry->File->getBuffer(Filename, FileSize, - RequiresNullTerminator, isVolatile); + // Check if the cached file's mode matches the requested mode + // Only perform mismatch recovery for real files + if (!Entry->File->realFileTextMismatch(IsText)) { + // Mode matches, use the cached file descriptor + auto Result = Entry->File->getBuffer(Filename, FileSize, + RequiresNullTerminator, isVolatile); + Entry->closeFile(); + return Result; + } + // Mode mismatch - close the cached file and reopen with correct mode Entry->closeFile(); - return Result; } - // Otherwise, open the file. + // Open the file with the requested mode. 
return getBufferForFileImpl(Filename, FileSize, isVolatile, RequiresNullTerminator, IsText); } diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index d22c534228331..a3ef38fe552a7 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -137,6 +137,18 @@ class LLVM_ABI File { /// Closes the file. virtual std::error_code close() = 0; + /// Returns true if this file was opened in text mode (with potential + /// encoding conversions), false if opened in binary mode. + /// Default implementation returns true for backward compatibility. + virtual bool isText() const { return true; } + + /// Returns true if this is a real file and the requested text mode differs + /// from the current mode. Always returns false for non-real files. + /// Default implementation returns false for non-real files. + virtual bool realFileTextMismatch(bool RequestedIsText) const { + return false; + } + // Get the same file with a different path. 
static ErrorOr<std::unique_ptr<File>> getWithPath(ErrorOr<std::unique_ptr<File>> Result, const Twine &P); diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 42e8bb4f9958e..2def668e63cb3 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -194,11 +194,13 @@ class RealFile : public File { file_t FD; Status S; std::string RealName; + bool IsTextMode; - RealFile(file_t RawFD, StringRef NewName, StringRef NewRealPathName) + RealFile(file_t RawFD, StringRef NewName, StringRef NewRealPathName, + bool IsText) : FD(RawFD), S(NewName, {}, {}, {}, {}, {}, llvm::sys::fs::file_type::status_error, {}), - RealName(NewRealPathName.str()) { + RealName(NewRealPathName.str()), IsTextMode(IsText) { assert(FD != kInvalidFile && "Invalid or inactive file descriptor"); } @@ -213,6 +215,10 @@ class RealFile : public File { bool IsVolatile) override; std::error_code close() override; void setPath(const Twine &Path) override; + bool isText() const override { return IsTextMode; } + bool realFileTextMismatch(bool RequestedIsText) const override { + return IsTextMode != RequestedIsText; + } }; } // namespace @@ -320,8 +326,9 @@ class RealFileSystem : public FileSystem { adjustPath(Name, Storage), Flags, &RealName); if (!FDOrErr) return errorToErrorCode(FDOrErr.takeError()); + bool IsText = (Flags & sys::fs::OF_Text) != sys::fs::OF_None; return std::unique_ptr<File>( - new RealFile(*FDOrErr, Name.str(), RealName.str())); + new RealFile(*FDOrErr, Name.str(), RealName.str(), IsText)); } struct WorkingDirectory { >From a33e9ee56ea4a99e8a9420f6097443d762ce6240 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Mon, 22 Jun 2026 17:04:01 -0400 Subject: [PATCH 06/14] Remove isText function --- llvm/include/llvm/Support/VirtualFileSystem.h | 5 ----- llvm/lib/Support/VirtualFileSystem.cpp | 1 - 2 files changed, 6 deletions(-) diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h 
b/llvm/include/llvm/Support/VirtualFileSystem.h index a3ef38fe552a7..8e5ffc0f051a6 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -137,11 +137,6 @@ class LLVM_ABI File { /// Closes the file. virtual std::error_code close() = 0; - /// Returns true if this file was opened in text mode (with potential - /// encoding conversions), false if opened in binary mode. - /// Default implementation returns true for backward compatibility. - virtual bool isText() const { return true; } - /// Returns true if this is a real file and the requested text mode differs /// from the current mode. Always returns false for non-real files. /// Default implementation returns false for non-real files. diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 2def668e63cb3..a46abbef127b4 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -215,7 +215,6 @@ class RealFile : public File { bool IsVolatile) override; std::error_code close() override; void setPath(const Twine &Path) override; - bool isText() const override { return IsTextMode; } bool realFileTextMismatch(bool RequestedIsText) const override { return IsTextMode != RequestedIsText; } >From 901c8ee08a62522622578029fc1dbf012ab42ce5 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 23 Jun 2026 15:14:07 -0400 Subject: [PATCH 07/14] Bob changes --- clang/lib/Basic/FileManager.cpp | 2 +- llvm/include/llvm/Support/VirtualFileSystem.h | 8 +++++--- llvm/lib/Support/VirtualFileSystem.cpp | 16 +++++++++++++--- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 717a692661588..dd0f7dc0a7053 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -543,7 +543,7 @@ FileManager::getBufferForFile(FileEntryRef FE, bool isVolatile, if (Entry->File) { // Check if 
the cached file's mode matches the requested mode // Only perform mismatch recovery for real files - if (!Entry->File->realFileTextMismatch(IsText)) { + if (!Entry->File->checkTextModeMismatch(IsText)) { // Mode matches, use the cached file descriptor auto Result = Entry->File->getBuffer(Filename, FileSize, RequiresNullTerminator, isVolatile); diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index 8e5ffc0f051a6..2bdf97e3721fc 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -137,10 +137,12 @@ class LLVM_ABI File { /// Closes the file. virtual std::error_code close() = 0; - /// Returns true if this is a real file and the requested text mode differs - /// from the current mode. Always returns false for non-real files. + /// Checks if this is a real file and the requested text mode differs + /// from the current mode. For real files with a text mode mismatch where + /// the buffer was previously requested, this will call llvm::report_fatal_error. + /// Always returns false for non-real files. /// Default implementation returns false for non-real files. 
- virtual bool realFileTextMismatch(bool RequestedIsText) const { + virtual bool checkTextModeMismatch(bool RequestedIsText) const { return false; } diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index a46abbef127b4..743edb336a19f 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -195,12 +195,14 @@ class RealFile : public File { Status S; std::string RealName; bool IsTextMode; + bool BufferWasRequested; RealFile(file_t RawFD, StringRef NewName, StringRef NewRealPathName, bool IsText) : FD(RawFD), S(NewName, {}, {}, {}, {}, {}, llvm::sys::fs::file_type::status_error, {}), - RealName(NewRealPathName.str()), IsTextMode(IsText) { + RealName(NewRealPathName.str()), IsTextMode(IsText), + BufferWasRequested(false) { assert(FD != kInvalidFile && "Invalid or inactive file descriptor"); } @@ -215,8 +217,15 @@ class RealFile : public File { bool IsVolatile) override; std::error_code close() override; void setPath(const Twine &Path) override; - bool realFileTextMismatch(bool RequestedIsText) const override { - return IsTextMode != RequestedIsText; + bool checkTextModeMismatch(bool RequestedIsText) const override { + bool HasMismatch = IsTextMode != RequestedIsText; + if (HasMismatch && BufferWasRequested) { + llvm::report_fatal_error( + "Text mode mismatch: file was previously opened with " + + Twine(IsTextMode ? "text" : "binary") + " mode, now requested with " + + Twine(RequestedIsText ? 
"text" : "binary") + " mode"); + } + return HasMismatch; } }; @@ -247,6 +256,7 @@ RealFile::getBuffer(const Twine &Name, int64_t FileSize, auto BypassSandbox = sys::sandbox::scopedDisable(); assert(FD != kInvalidFile && "cannot get buffer for closed file"); + BufferWasRequested = true; return MemoryBuffer::getOpenFile(FD, Name, FileSize, RequiresNullTerminator, IsVolatile); } >From ca78018eb1dc2e7be521809c1aa32148cbe65df5 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 23 Jun 2026 15:24:28 -0400 Subject: [PATCH 08/14] update name --- clang/lib/Basic/FileManager.cpp | 2 +- llvm/include/llvm/Support/VirtualFileSystem.h | 2 +- llvm/lib/Support/VirtualFileSystem.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index dd0f7dc0a7053..94fc4c15f51d2 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -543,7 +543,7 @@ FileManager::getBufferForFile(FileEntryRef FE, bool isVolatile, if (Entry->File) { // Check if the cached file's mode matches the requested mode // Only perform mismatch recovery for real files - if (!Entry->File->checkTextModeMismatch(IsText)) { + if (!Entry->File->realFileCheckTextModeMismatch(IsText)) { // Mode matches, use the cached file descriptor auto Result = Entry->File->getBuffer(Filename, FileSize, RequiresNullTerminator, isVolatile); diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index 2bdf97e3721fc..9cc35b6f5fb6d 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -142,7 +142,7 @@ class LLVM_ABI File { /// the buffer was previously requested, this will call llvm::report_fatal_error. /// Always returns false for non-real files. /// Default implementation returns false for non-real files. 
- virtual bool checkTextModeMismatch(bool RequestedIsText) const { + virtual bool realFileCheckTextModeMismatch(bool RequestedIsText) const { return false; } diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 743edb336a19f..1c1f9e9dd15b6 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -217,7 +217,7 @@ class RealFile : public File { bool IsVolatile) override; std::error_code close() override; void setPath(const Twine &Path) override; - bool checkTextModeMismatch(bool RequestedIsText) const override { + bool realFileCheckTextModeMismatch(bool RequestedIsText) const override { bool HasMismatch = IsTextMode != RequestedIsText; if (HasMismatch && BufferWasRequested) { llvm::report_fatal_error( >From 54f2ba7aba3fbe0a9cd52315762209393f1e5857 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 23 Jun 2026 16:02:43 -0400 Subject: [PATCH 09/14] Mode mismatch checking --- clang/lib/Basic/FileManager.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 94fc4c15f51d2..edaeea08b20c1 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -550,7 +550,8 @@ FileManager::getBufferForFile(FileEntryRef FE, bool isVolatile, Entry->closeFile(); return Result; } - // Mode mismatch - close the cached file and reopen with correct mode + // Mode mismatch - close the cached file and reopen with correct mode by + // falling through. 
Entry->closeFile(); } >From 5f867136428c01cc115884044f6e3ae90fe53788 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 23 Jun 2026 16:07:07 -0400 Subject: [PATCH 10/14] Create converters in createFileID --- clang/include/clang/Basic/SourceManager.h | 4 +- .../include/clang/Frontend/CompilerInstance.h | 1 + clang/lib/Basic/SourceManager.cpp | 56 ++++++++++++++++--- clang/lib/Frontend/CompilerInstance.cpp | 10 +++- .../lib/Frontend/VerifyDiagnosticConsumer.cpp | 4 +- clang/lib/Lex/ModuleMap.cpp | 7 ++- clang/lib/Lex/PPDirectives.cpp | 6 +- clang/lib/Lex/Preprocessor.cpp | 4 +- clang/lib/Serialization/ASTReader.cpp | 5 +- 9 files changed, 81 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index 4c90e4a538c52..347dff62b2c38 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -938,6 +938,7 @@ class SourceManager : public RefCountedBase<SourceManager> { /// being \#included from the specified IncludePosition. FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, SrcMgr::CharacteristicKind FileCharacter, + llvm::StringRef InputEncodingName = {}, int LoadedID = 0, SourceLocation::UIntTy LoadedOffset = 0); @@ -962,7 +963,8 @@ class SourceManager : public RefCountedBase<SourceManager> { /// Get the FileID for \p SourceFile if it exists. Otherwise, create a /// new FileID for the \p SourceFile. FileID getOrCreateFileID(FileEntryRef SourceFile, - SrcMgr::CharacteristicKind FileCharacter); + SrcMgr::CharacteristicKind FileCharacter, + llvm::StringRef InputEncodingName = {}); /// Creates an expansion SLocEntry for the substitution of an argument into a /// function-like macro's body. Returns the start of the expansion. 
diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index bb0eddb918623..522fecfcab35e 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -864,6 +864,7 @@ class CompilerInstance : public ModuleLoader { /// /// \return True on success. static bool InitializeSourceManager(const FrontendInputFile &Input, + llvm::StringRef InputEncodingName = {}, DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr); diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 46a7b8b85e2dd..2e71d39dc232c 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -121,7 +121,16 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // return paths. IsBufferInvalid = true; - auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile); + // If a converter is set, open the file in binary mode to get raw bytes + // and avoid platform-specific auto-conversion (e.g., EBCDIC->ASCII on z/OS, + // CRLF->LF on Windows). The explicit converter will handle all transformations. 
+ bool NeedsExplicitConversion = FileIDConverterInfo.getPointer() != nullptr; + bool IsText = !NeedsExplicitConversion; + + auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile, + /*RequiresNullTerminator=*/true, + /*MaybeLimit=*/std::nullopt, + IsText); // If we were unable to open the file, then we are in an inconsistent // situation where the content cache referenced a file which no longer @@ -610,23 +619,53 @@ FileID SourceManager::getNextFileID(FileID FID) const { FileID SourceManager::createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, SrcMgr::CharacteristicKind FileCharacter, + llvm::StringRef InputEncodingName, int LoadedID, SourceLocation::UIntTy LoadedOffset) { SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile, isSystem(FileCharacter)); + llvm::ErrorOr<llvm::TextEncodingConverter *> Converter = nullptr; + llvm::ErrorOr<llvm::SmallString<32>> Ccsid = + llvm::getEncodingNameFromFileTag(SourceFile.getName()); + if (!Ccsid) { + Diag.Report(SourceLocation(), diag::err_cannot_open_file) + << SourceFile.getName() << Ccsid.getError().message(); + return FileID(); + } + if (!Ccsid->empty()) { + // File has a tag, use the converter from SourceManager's cache + Converter = getOrCreateConverter(*Ccsid); + if (!Converter) { + Diag.Report(SourceLocation(), diag::err_cannot_open_file) + << SourceFile.getName() + << (llvm::Twine("cannot create converter from encoding '") + *Ccsid + "'"); + return FileID(); + } + } else if (!InputEncodingName.empty()) { + // No file tag but -finput-charset conversion is desired. + // Get the converter from the cache using the input encoding name. 
+ Converter = getOrCreateConverter(InputEncodingName); + if (!Converter) { + llvm::report_fatal_error( + "Cannot create converter for file '" + SourceFile.getName() + "': " + + Converter.getError().message()); + } + } + #ifndef NDEBUG // Either the content cache has never been used for a FileID (and, if we are // being asked to use a converter, there should be no valid buffer set up for // it) or the conversion (or lack thereof) should be the same as that used // previously. auto [CacheConverter, CacheUsedByFileID] = IR.FileIDConverterInfo; + llvm::TextEncodingConverter *ConverterPtr = Converter ? *Converter : nullptr; if (CacheUsedByFileID) - assert(CacheConverter == Converter); + assert(CacheConverter == ConverterPtr); else - assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded()); + assert(!ConverterPtr || IR.IsBufferInvalid || !IR.getBufferIfLoaded()); #endif - IR.FileIDConverterInfo.setPointerAndInt(Converter, true); + IR.FileIDConverterInfo.setPointerAndInt(Converter ? *Converter : nullptr, true); // If this is a named pipe, immediately load the buffer to ensure subsequent // calls to ContentCache::getSize() are accurate. @@ -669,10 +708,12 @@ FileID SourceManager::createFileID(const llvm::MemoryBufferRef &Buffer, /// new FileID for the \p SourceFile. FileID SourceManager::getOrCreateFileID(FileEntryRef SourceFile, - SrcMgr::CharacteristicKind FileCharacter) { + SrcMgr::CharacteristicKind FileCharacter, + llvm::StringRef InputEncodingName) { FileID ID = translateFile(SourceFile); return ID.isValid() ? 
ID - : createFileID(SourceFile, SourceLocation(), FileCharacter); + : createFileID(SourceFile, SourceLocation(), FileCharacter, + InputEncodingName); } /// createFileID - Create a new FileID for the specified ContentCache and @@ -2427,7 +2468,8 @@ SourceManagerForFile::SourceManagerForFile(StringRef FileName, SourceMgr = std::make_unique<SourceManager>(*Diagnostics, *FileMgr); FileEntryRef FE = llvm::cantFail(FileMgr->getFileRef(FileName)); FileID ID = - SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User); + SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User, + /*InputEncodingName=*/{}); assert(ID.isValid()); SourceMgr->setMainFileID(ID); } diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 8aee45b5dc644..42956fb18c104 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -912,12 +912,15 @@ CompilerInstance::createOutputFileImpl(StringRef OutputPath, bool Binary, // Initialization Utilities bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input){ - return InitializeSourceManager(Input, getDiagnostics(), getFileManager(), - getSourceManager()); + StringRef InputEncodingName = + hasPreprocessor() ? 
llvm::StringRef(getLangOpts().InputEncoding) : llvm::StringRef() + return InitializeSourceManager(Input, InputEncodingName, getDiagnostics(), + getFileManager(), getSourceManager()); } // static bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input, + llvm::StringRef InputEncodingName, DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr) { @@ -950,7 +953,8 @@ bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input, } SourceMgr.setMainFileID( - SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind)); + SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind, + InputEncodingName)); assert(SourceMgr.getMainFileID().isValid() && "Couldn't establish MainFileID!"); diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp index 1bfe644b2525a..691bc5a5fd31d 100644 --- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp +++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp @@ -610,8 +610,10 @@ static bool ParseDirective(StringRef S, ExpectedData *ED, SourceManager &SM, } FileID FID = SM.translateFile(*File); + // FIXME: Figure out character-encoding converter treatment. if (FID.isInvalid()) - FID = SM.createFileID(*File, Pos, SrcMgr::C_User); + FID = SM.createFileID(*File, Pos, SrcMgr::C_User, + /*InputEncodingName=*/{}); if (PH.Next(Line) && Line > 0) ExpectedLoc = SM.translateLineCol(FID, Line, 1); diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 6c07386f89010..3e5c5b317d580 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -1473,7 +1473,12 @@ bool ModuleMap::parseModuleMapFile(FileEntryRef File, bool IsSystem, if (LocalFID.isInvalid()) { auto FileCharacter = IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap; - LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter); + // Module map files are textual "source files". 
Use input charset converter + // if available, and file tag converters are handled by SourceManager's cache. + // Get input encoding from LangOptions for charset conversion + llvm::StringRef InputEncodingName = LangOpts.InputEncoding; + LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter, + InputEncodingName); } ID = LocalFID; } diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index eb21a510dcf83..f989c2d1d4b96 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -2796,7 +2796,11 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // position on the file where it will be included and after the expansions. if (IncludePos.isMacroID()) IncludePos = SourceMgr.getExpansionRange(IncludePos).getEnd(); - FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter); + // Use the SourceManager's input charset converter for non-tagged files + // by passing the input encoding name + llvm::StringRef InputEncodingName = getLangOpts().InputEncoding; + FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter, + InputEncodingName); if (!FID.isValid()) { TheModuleLoader.HadFatalFailure = true; return ImportAction::Failure; diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 1e21b4a94cea3..0b07f7de8675a 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -649,8 +649,10 @@ void Preprocessor::EnterMainSourceFile() { << PPOpts.PCHThroughHeader; return; } + // FIXME: Figure out character-encoding converter treatment. setPCHThroughHeaderFileID( - SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User)); + SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User, + /*InputEncodingName=*/{})); } // Skip tokens from the Predefines and if needed the main file. 
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index f8a6a38bb9b5c..52b60df62977d 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -2002,7 +2002,10 @@ bool ASTReader::ReadSLocEntry(int ID) { } SrcMgr::CharacteristicKind FileCharacter = (SrcMgr::CharacteristicKind)Record[2]; - FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, ID, + // Note: If conversion was originally necessary, OverriddenBuffer should be + // true and the associated handling will trigger. + FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, + /*InputEncodingName=*/{}, ID, BaseOffset + Record[0]); SrcMgr::FileInfo &FileInfo = SourceMgr.getSLocEntry(FID).getFile(); FileInfo.NumCreatedFIDs = Record[5]; >From ec7463b27999ce40a92abf767eecba70eac713ca Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Wed, 24 Jun 2026 21:35:09 -0400 Subject: [PATCH 11/14] fix fallback --- clang/CMakeLists.txt | 3 ++ .../clang/Basic/DiagnosticCommonKinds.td | 4 ++- clang/include/clang/Basic/SourceManager.h | 34 ++++++++++++++++++- clang/include/clang/Config/config.h.cmake | 3 ++ clang/lib/Basic/SourceManager.cpp | 33 +++++++++++++----- clang/lib/Driver/ToolChains/Clang.cpp | 15 +++++++- 6 files changed, 80 insertions(+), 12 deletions(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index cd7ba53b03061..b73a7218c3e81 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -275,6 +275,9 @@ set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL set(PPC_LINUX_DEFAULT_IEEELONGDOUBLE OFF CACHE BOOL "Enable IEEE binary128 as default long double format on PowerPC Linux.") +set(CLANG_DEFAULT_INPUT_ENCODING_IBM1047 OFF CACHE BOOL + "Set IBM-1047 as the default input encoding") + set(CLANG_SPAWN_CC1 OFF CACHE BOOL "Whether clang should use a new process for the CC1 invocation") diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td 
b/clang/include/clang/Basic/DiagnosticCommonKinds.td index 8ebac3908b465..4dc958cad59ce 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -418,8 +418,10 @@ def note_file_sloc_usage : Note< def note_file_misc_sloc_usage : Note< "%0 additional files entered using a total of %1B (%human1B) of space">; def warn_charset_conversion_failed : Warning< - "conversion from source encoding failed for '%0': %1; interpreting as IBM-1047">, + "conversion from source encoding failed for '%0': %1; interpreting as %2">, InGroup<DiagGroup<"charset-conversion-failed">>; +def err_charset_conversion_failed : Error< + "conversion from source encoding failed for '%0': %1">; // Modules def err_module_format_unhandled : Error< diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index 347dff62b2c38..1c74d7d34d6f6 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -159,7 +159,9 @@ class alignas(8) ContentCache { /// Information on whether this is associated with a FileID for a file (as /// opposed to a buffer) and, if so, what conversion (if any) was requested. - llvm::PointerIntPair<llvm::TextEncodingConverter *, 1u, bool> + /// The integer part uses 2 bits: bit 0 indicates if used by FileID, + /// bit 1 indicates if the file was tagged. + llvm::PointerIntPair<llvm::TextEncodingConverter *, 2u, unsigned> FileIDConverterInfo; /// A bump pointer allocated array of offsets for each source line. @@ -277,6 +279,36 @@ class alignas(8) ContentCache { // If BufStr has an invalid BOM, returns the BOM name; otherwise, returns // nullptr + + /// Helper methods for FileIDConverterInfo bit manipulation. 
+ /// Bit 0: Used by FileID flag + /// Bit 1: File tagged flag + + bool isUsedByFileID() const { + return FileIDConverterInfo.getInt() & 0x1; + } + + void setUsedByFileID(bool Used) { + unsigned Flags = FileIDConverterInfo.getInt(); + if (Used) + Flags |= 0x1; + else + Flags &= ~0x1; + FileIDConverterInfo.setInt(Flags); + } + + bool isFileTagged() const { + return FileIDConverterInfo.getInt() & 0x2; + } + + void setFileTagged(bool Tagged) { + unsigned Flags = FileIDConverterInfo.getInt(); + if (Tagged) + Flags |= 0x2; + else + Flags &= ~0x2; + FileIDConverterInfo.setInt(Flags); + } static const char *getInvalidBOM(StringRef BufStr); }; diff --git a/clang/include/clang/Config/config.h.cmake b/clang/include/clang/Config/config.h.cmake index 11b4096726f67..fbafafc710afe 100644 --- a/clang/include/clang/Config/config.h.cmake +++ b/clang/include/clang/Config/config.h.cmake @@ -75,6 +75,9 @@ /* Enable IEEE binary128 as default long double format on PowerPC Linux. */ #cmakedefine01 PPC_LINUX_DEFAULT_IEEELONGDOUBLE +/* Set IBM-1047 as the default input encoding */ +#cmakedefine01 CLANG_DEFAULT_INPUT_ENCODING_IBM1047 + /* Enable each functionality of modules */ #cmakedefine01 CLANG_ENABLE_OBJC_REWRITER #cmakedefine01 CLANG_ENABLE_STATIC_ANALYZER diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 2e71d39dc232c..f8100aa7b2e32 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -166,22 +166,27 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, std::error_code EC = Converter->convert(OriginalBuf, UTF8Buf); if (EC) { + // For tagged files, conversion failure is an error and we don't fall back + if (isFileTagged()) { + Diag.Report(Loc, diag::err_charset_conversion_failed) + << ContentsEntry->getName() << EC.message(); + return std::nullopt; + } + // If conversion fails, emit a warning and fall back to interpreting the - // file as UTF-8 without conversion. 
+ // file as the default charset. // // This allows the compiler to accept system or third-party headers that - // are encoded in UTF-8 even if conversion to the option-specified input - // charset failed. - // - // Diagnostics already exist when files are not well-formed UTF-8. + // are encoded in the default charset even if conversion to the + // option-specified input charset failed. // // TODO: Add input byte offset information. // - // TODO: Consider adjusting the message to omit the "interpreting as - // UTF-8" recovery description if the warning has been upgraded to an - // error. + // TODO: Consider adjusting the message to omit the recovery description + // if the warning has been upgraded to an error. + const char *FallbackEncoding = CLANG_DEFAULT_INPUT_ENCODING_IBM1047 ? "IBM-1047" : "UTF-8"; Diag.Report(Loc, diag::warn_charset_conversion_failed) - << ContentsEntry->getName() << EC.message(); + << ContentsEntry->getName() << EC.message() << FallbackEncoding; } else { // TODO: Reclaim memory if the buffer size exceeds the content. auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>( @@ -642,6 +647,7 @@ FileID SourceManager::createFileID(FileEntryRef SourceFile, << (llvm::Twine("cannot create converter from encoding '") + *Ccsid + "'"); return FileID(); } + IR.setFileTagged(true); } else if (!InputEncodingName.empty()) { // No file tag but -finput-charset conversion is desired. // Get the converter from the cache using the input encoding name. 
@@ -651,6 +657,15 @@ FileID SourceManager::createFileID(FileEntryRef SourceFile, "Cannot create converter for file '" + SourceFile.getName() + "': " + Converter.getError().message()); } + } else if (CLANG_DEFAULT_INPUT_ENCODING_IBM1047) { + // When IBM-1047 is the default and no file tag or explicit -finput-charset + // is provided, use IBM-1047 as the default source encoding + Converter = getOrCreateConverter("IBM-1047"); + if (!Converter) { + llvm::report_fatal_error( + "Cannot create IBM-1047 converter for file '" + SourceFile.getName() + "': " + + Converter.getError().message()); + } } #ifndef NDEBUG diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 323417e294d5a..5cd60fca92d44 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7839,10 +7839,23 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // -finput_charset=UTF-8 is default. Reject others if (Arg *inputCharset = Args.getLastArg(options::OPT_finput_charset_EQ)) { StringRef value = inputCharset->getValue(); - if (!value.equals_insensitive("utf-8")) + bool isValid = value.equals_insensitive("utf-8"); +#if CLANG_DEFAULT_INPUT_ENCODING_IBM1047 + // When IBM-1047 default is enabled, also accept IBM-1047 + isValid = isValid || value.equals_insensitive("ibm-1047") || + value.equals_insensitive("ibm1047"); +#endif + if (!isValid) D.Diag(diag::err_drv_invalid_value) << inputCharset->getAsString(Args) << value; } +#if CLANG_DEFAULT_INPUT_ENCODING_IBM1047 + else { + // When IBM-1047 default is enabled and no explicit charset is specified, + // set IBM-1047 as the default + CmdArgs.push_back("-finput-charset=IBM-1047"); + } +#endif // -fexec_charset=UTF-8 is default. 
Reject others if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) { >From 4d9787f33c4092709094db4f509b2d977804b975 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Wed, 24 Jun 2026 21:48:42 -0400 Subject: [PATCH 12/14] return type --- llvm/include/llvm/Support/AutoConvert.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index 8797664b1b337..5d3849f765b0f 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -114,19 +114,19 @@ getEncodingNameFromFileTag(const Twine &FileName, const int FD = -1) { __ccsid_t Tag = *TagOrErr; if (Tag == 0) - return {}; // Return empty string for no tag + return SmallString<32>(); // Return empty string for no tag if (Tag == 1208) - return {"utf-8"}; + return SmallString<32>("utf-8"); if (Tag == 1047) - return {"ibm-1047"}; + return SmallString<32>("ibm-1047"); SmallString<32> Result; raw_svector_ostream(Result) << Tag; return Result; #else - return {}; // Return empty string for non-MVS platforms + return SmallString<32>(); // Return empty string for non-MVS platforms #endif } >From e0061c18fb442f3556c5e1329f08e2fef7732512 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Wed, 24 Jun 2026 21:53:28 -0400 Subject: [PATCH 13/14] fix error for smallstring --- llvm/include/llvm/Support/AutoConvert.h | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index 5d3849f765b0f..b437b157b7725 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -18,6 +18,7 @@ #include <_Ccsid.h> #endif #ifdef __cplusplus +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Error.h" #include <system_error> >From a2cc01c801024743a6e3b0f883d8b618cb734c66 Mon Sep 17 00:00:00 2001 From: alisonzhang
<[email protected]> Date: Wed, 24 Jun 2026 22:02:37 -0400 Subject: [PATCH 14/14] add flag --- clang/lib/Basic/SourceManager.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index f8100aa7b2e32..8866861ca3a5d 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "clang/Config/config.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/FileManager.h" _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
