https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/98652
>From 4583cacec6daa1e980d5149be063120b34731f4c Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <abhina.sreeskanthara...@ibm.com> Date: Fri, 12 Jul 2024 11:17:24 -0400 Subject: [PATCH 1/2] update autoconversion functionality to fix error: source file is not valid UTF-8 --- clang/include/clang/Basic/FileEntry.h | 9 ++++++ clang/lib/Basic/SourceManager.cpp | 26 +++++++++++++++- llvm/include/llvm/Support/AutoConvert.h | 11 +++++-- llvm/lib/Support/AutoConvert.cpp | 40 ++++++++++++++++++++++++- llvm/lib/Support/MemoryBuffer.cpp | 16 ++++++++-- 5 files changed, 96 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h index 68d4bf60930037..1fe6c3617582ce 100644 --- a/clang/include/clang/Basic/FileEntry.h +++ b/clang/include/clang/Basic/FileEntry.h @@ -70,6 +70,11 @@ class FileEntryRef { const FileEntry &getFileEntry() const { return *getBaseMapEntry().second->V.get<FileEntry *>(); } +#ifdef __MVS__ + FileEntry &getFileEntry() { + return *getBaseMapEntry().second->V.get<FileEntry *>(); + } +#endif DirectoryEntryRef getDir() const { return ME->second->Dir; } inline off_t getSize() const; @@ -323,6 +328,10 @@ class FileEntry { StringRef tryGetRealPathName() const { return RealPathName; } off_t getSize() const { return Size; } +#ifdef __MVS__ + // Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion. + void setSize(off_t NewSize) { Size = NewSize; } +#endif unsigned getUID() const { return UID; } const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; } time_t getModificationTime() const { return ModTime; } diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 6e588ce63d813f..21cf40fc0cb9e2 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/AutoConvert.h" #include "llvm/Support/Capacity.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" @@ -156,10 +157,16 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // Unless this is a named pipe (in which case we can handle a mismatch), // check that the file's size is the same as in the file entry (which may // have come from a stat cache). +#ifndef __MVS__ if (!ContentsEntry->isNamedPipe() && Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) { +#else + // The buffer will always be larger than the file size on z/OS in the presence + // of characters outside the base character set. + if (!ContentsEntry->isNamedPipe() && + Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { +#endif Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); - return std::nullopt; } @@ -602,6 +609,23 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename, return FileID::get(LoadedID); } unsigned FileSize = File.getSize(); +#ifdef __MVS__ + llvm::ErrorOr<bool> NeedConversion = + llvm::needzOSConversion(Filename.str().c_str()); + if (NeedConversion && *NeedConversion) { + // Buffer size may increase due to potential z/OS EBCDIC to UTF-8 + // conversion. + if (std::optional<llvm::MemoryBufferRef> Buffer = + File.getBufferOrNone(Diag, getFileManager())) { + unsigned BufSize = Buffer->getBufferSize(); + if (BufSize > FileSize) { + if (File.ContentsEntry.has_value()) + File.ContentsEntry->getFileEntry().setSize(BufSize); + FileSize = BufSize; + } + } + } +#endif if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset && NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) { Diag.Report(IncludePos, diag::err_sloc_space_too_large); diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index 65ac576ae5676b..5d6d9394ef1d81 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -17,6 +17,7 @@ #ifdef __MVS__ #include <_Ccsid.h> #ifdef __cplusplus +#include "llvm/Support/ErrorOr.h" #include <system_error> #endif /* __cplusplus */ @@ -54,8 +55,14 @@ std::error_code restorezOSStdHandleAutoConversion(int FD); /** \brief Set the tag information for a file descriptor. */ std::error_code setzOSFileTag(int FD, int CCSID, bool Text); -} /* namespace llvm */ -#endif /* __cplusplus */ +// Get the the tag ccsid for a file name or a file descriptor. +ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1); + +// Query the file tag to determine if it needs conversion to UTF-8 codepage. +ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1); + +} // namespace llvm +#endif // __cplusplus #endif /* __MVS__ */ diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index 66570735f8fc88..f7918548df1d0d 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -20,6 +20,8 @@ #include <sys/stat.h> #include <unistd.h> +using namespace llvm; + static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1}; int disablezOSAutoConversion(int FD) { @@ -116,4 +118,40 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { return std::error_code(); } -#endif // __MVS__ +ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) { + // If we have a file descriptor, use it to find out file tagging. Otherwise we + // need to use stat() with the file path. + if (FD != -1) { + struct f_cnvrt Query = { + QUERYCVT, // cvtcmd + 0, // pccsid + 0, // fccsid + }; + if (fcntl(FD, F_CONTROL_CVT, &Query) == -1) + return std::error_code(errno, std::generic_category()); + return Query.fccsid; + } + struct stat Attr; + if (stat(FileName, &Attr) == -1) + return std::error_code(errno, std::generic_category()); + return Attr.st_tag.ft_ccsid; +} + +ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) { + ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD); + if (std::error_code EC = Ccsid.getError()) + return EC; + // We don't need conversion for UTF-8 tagged files or binary files. + // TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve + // problems related to UTF-8 tagged source files. + switch (*Ccsid) { + case CCSID_UTF_8: + case CCSID_ISO8859_1: + case FT_BINARY: + return false; + default: + return true; + } +} + +#endif //__MVS__ diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 7ea68ee4cafd76..e2044bcc4e4f08 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -361,6 +361,11 @@ static bool shouldUseMmap(sys::fs::file_t FD, bool RequiresNullTerminator, int PageSize, bool IsVolatile) { +#if defined(__MVS__) + // zOS Enhanced ASCII auto convert does not support mmap. + return false; +#endif + // mmap may leave the buffer without null terminator if the file size changed // by the time the last page is mapped in, so avoid it if the file size is // likely to change. @@ -503,9 +508,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, } #ifdef __MVS__ - // Set codepage auto-conversion for z/OS. - if (auto EC = llvm::enablezOSAutoConversion(FD)) + ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD); + if (std::error_code EC = NeedConversion.getError()) return EC; + // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we + // cannot trust the file size and we create the memory buffer by copying + // off the stream. + // Note: This only works with the assumption of reading a full file (i.e, + // Offset == 0 and MapSize == FileSize). Reading a file slice does not work. + if (Offset == 0 && MapSize == FileSize && *NeedConversion) + return getMemoryBufferForStream(FD, Filename); #endif auto Buf = >From 97862dd71017c6069cf3d7a10f0fadc5e6a451e0 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <abhina.sreeskanthara...@ibm.com> Date: Fri, 6 Dec 2024 12:10:29 -0500 Subject: [PATCH 2/2] remove #ifdefs --- clang/include/clang/Basic/FileEntry.h | 10 +++++----- clang/lib/Basic/SourceManager.cpp | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h index 1fe6c3617582ce..7f24eb33084a19 100644 --- a/clang/include/clang/Basic/FileEntry.h +++ b/clang/include/clang/Basic/FileEntry.h @@ -70,11 +70,13 @@ class FileEntryRef { const FileEntry &getFileEntry() const { return *getBaseMapEntry().second->V.get<FileEntry *>(); } -#ifdef __MVS__ - FileEntry &getFileEntry() { + + // This is a non const version of getFileEntry() which is used if the buffer + // size needs to be increased due to potential z/OS EBCDIC -> UTF-8 conversion + FileEntry &getFileEntryToUpdate() { return *getBaseMapEntry().second->V.get<FileEntry *>(); } -#endif + DirectoryEntryRef getDir() const { return ME->second->Dir; } inline off_t getSize() const; @@ -328,10 +330,8 @@ class FileEntry { StringRef tryGetRealPathName() const { return RealPathName; } off_t getSize() const { return Size; } -#ifdef __MVS__ // Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion. void setSize(off_t NewSize) { Size = NewSize; } -#endif unsigned getUID() const { return UID; } const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; } time_t getModificationTime() const { return ModTime; } diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 21cf40fc0cb9e2..f56f9943e24430 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -157,16 +157,13 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // Unless this is a named pipe (in which case we can handle a mismatch), // check that the file's size is the same as in the file entry (which may // have come from a stat cache). -#ifndef __MVS__ - if (!ContentsEntry->isNamedPipe() && - Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) { -#else // The buffer will always be larger than the file size on z/OS in the presence // of characters outside the base character set. + assert(Buffer->getBufferSize() <= (size_t)ContentsEntry->getSize()); if (!ContentsEntry->isNamedPipe() && Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { -#endif Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); + return std::nullopt; } @@ -590,6 +587,15 @@ SourceManager::getOrCreateFileID(FileEntryRef SourceFile, FileCharacter); } +/// Helper function to determine if an input file requires conversion +llvm::ErrorOr<bool> needConversion(StringRef Filename) { +#ifdef __MVS__ + return llvm::needzOSConversion(Filename.str().c_str()); +#else + return false; +#endif +} + /// createFileID - Create a new FileID for the specified ContentCache and /// include position. This works regardless of whether the ContentCache /// corresponds to a file or some other input source. @@ -609,9 +615,7 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename, return FileID::get(LoadedID); } unsigned FileSize = File.getSize(); -#ifdef __MVS__ - llvm::ErrorOr<bool> NeedConversion = - llvm::needzOSConversion(Filename.str().c_str()); + llvm::ErrorOr<bool> NeedConversion = needConversion(Filename); if (NeedConversion && *NeedConversion) { // Buffer size may increase due to potential z/OS EBCDIC to UTF-8 // conversion. @@ -620,12 +624,11 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename, unsigned BufSize = Buffer->getBufferSize(); if (BufSize > FileSize) { if (File.ContentsEntry.has_value()) - File.ContentsEntry->getFileEntry().setSize(BufSize); + File.ContentsEntry->getFileEntryToUpdate().setSize(BufSize); FileSize = BufSize; } } } -#endif if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset && NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) { Diag.Report(IncludePos, diag::err_sloc_space_too_large); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits