https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/98652
>From e7be53314994b9a051ba2ff99dfd029937ebcc07 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <abhina.sreeskanthara...@ibm.com> Date: Fri, 12 Jul 2024 11:17:24 -0400 Subject: [PATCH 1/3] update autoconversion functionality to fix error: source file is not valid UTF-8 --- clang/include/clang/Basic/FileEntry.h | 9 ++++++ clang/lib/Basic/SourceManager.cpp | 25 ++++++++++++++++ llvm/include/llvm/Support/AutoConvert.h | 7 +++++ llvm/lib/Support/AutoConvert.cpp | 40 ++++++++++++++++++++++++- llvm/lib/Support/MemoryBuffer.cpp | 16 ++++++++-- llvm/lib/Support/VirtualFileSystem.cpp | 2 +- 6 files changed, 95 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h index 68d4bf60930037..1fe6c3617582ce 100644 --- a/clang/include/clang/Basic/FileEntry.h +++ b/clang/include/clang/Basic/FileEntry.h @@ -70,6 +70,11 @@ class FileEntryRef { const FileEntry &getFileEntry() const { return *getBaseMapEntry().second->V.get<FileEntry *>(); } +#ifdef __MVS__ + FileEntry &getFileEntry() { + return *getBaseMapEntry().second->V.get<FileEntry *>(); + } +#endif DirectoryEntryRef getDir() const { return ME->second->Dir; } inline off_t getSize() const; @@ -323,6 +328,10 @@ class FileEntry { StringRef tryGetRealPathName() const { return RealPathName; } off_t getSize() const { return Size; } +#ifdef __MVS__ + // Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion. + void setSize(off_t NewSize) { Size = NewSize; } +#endif unsigned getUID() const { return UID; } const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; } time_t getModificationTime() const { return ModTime; } diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index d6ec26af80aadd..44b56a352dfc4e 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/AutoConvert.h" #include "llvm/Support/Capacity.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" @@ -166,8 +167,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // Unless this is a named pipe (in which case we can handle a mismatch), // check that the file's size is the same as in the file entry (which may // have come from a stat cache). +#ifndef __MVS__ if (!ContentsEntry->isNamedPipe() && Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) { +#else + // The buffer will always be larger than the file size on z/OS in the presence + // of characters outside the base character set. + if (!ContentsEntry->isNamedPipe() && + Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { +#endif if (Diag.isDiagnosticInFlight()) Diag.SetDelayedDiagnostic(diag::err_file_modified, ContentsEntry->getName()); @@ -617,6 +625,23 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename, return FileID::get(LoadedID); } unsigned FileSize = File.getSize(); +#ifdef __MVS__ + llvm::ErrorOr<bool> NeedConversion = + llvm::needConversion(Filename.str().c_str()); + if (NeedConversion && *NeedConversion) { + // Buffer size may increase due to potential z/OS EBCDIC to UTF-8 + // conversion. + if (std::optional<llvm::MemoryBufferRef> Buffer = + File.getBufferOrNone(Diag, getFileManager())) { + unsigned BufSize = Buffer->getBufferSize(); + if (BufSize > FileSize) { + if (File.ContentsEntry.has_value()) + File.ContentsEntry->getFileEntry().setSize(BufSize); + FileSize = BufSize; + } + } + } +#endif if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset && NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) { Diag.Report(IncludePos, diag::err_sloc_space_too_large); diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index 6f45c4683f7775..a9b3e0357589bc 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -17,6 +17,7 @@ #ifdef __MVS__ #include <_Ccsid.h> #ifdef __cplusplus +#include "llvm/Support/ErrorOr.h" #include <system_error> #endif // __cplusplus @@ -52,6 +53,12 @@ std::error_code restorezOSStdHandleAutoConversion(int FD); /// \brief Set the tag information for a file descriptor. std::error_code setzOSFileTag(int FD, int CCSID, bool Text); +// Get the the tag ccsid for a file name or a file descriptor. +ErrorOr<__ccsid_t> getFileTag(const char *FileName, const int FD = -1); + +// Query the file tag to determine if it needs conversion to UTF-8 codepage. +ErrorOr<bool> needConversion(const char *FileName, const int FD = -1); + } // namespace llvm #endif // __cplusplus diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index 66570735f8fc88..4b0463ee5c92ac 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -20,6 +20,8 @@ #include <sys/stat.h> #include <unistd.h> +using namespace llvm; + static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1}; int disablezOSAutoConversion(int FD) { @@ -116,4 +118,40 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { return std::error_code(); } -#endif // __MVS__ +ErrorOr<__ccsid_t> llvm::getFileTag(const char *FileName, const int FD) { + // If we have a file descriptor, use it to find out file tagging. Otherwise we + // need to use stat() with the file path. + if (FD != -1) { + struct f_cnvrt Query = { + QUERYCVT, // cvtcmd + 0, // pccsid + 0, // fccsid + }; + if (fcntl(FD, F_CONTROL_CVT, &Query) == -1) + return std::error_code(errno, std::generic_category()); + return Query.fccsid; + } + struct stat Attr; + if (stat(FileName, &Attr) == -1) + return std::error_code(errno, std::generic_category()); + return Attr.st_tag.ft_ccsid; +} + +ErrorOr<bool> llvm::needConversion(const char *FileName, const int FD) { + ErrorOr<__ccsid_t> Ccsid = getFileTag(FileName, FD); + if (std::error_code EC = Ccsid.getError()) + return EC; + // We don't need conversion for UTF-8 tagged files or binary files. + // TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve + // problems related to UTF-8 tagged source files. + switch (*Ccsid) { + case CCSID_UTF_8: + case CCSID_ISO8859_1: + case FT_BINARY: + return false; + default: + return true; + } +} + +#endif //__MVS__ diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index aea81964ba9fde..be351129879ea4 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -362,6 +362,11 @@ static bool shouldUseMmap(sys::fs::file_t FD, bool RequiresNullTerminator, int PageSize, bool IsVolatile) { +#if defined(__MVS__) + // zOS Enhanced ASCII auto convert does not support mmap. + return false; +#endif + // mmap may leave the buffer without null terminator if the file size changed // by the time the last page is mapped in, so avoid it if the file size is // likely to change. @@ -504,9 +509,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, } #ifdef __MVS__ - // Set codepage auto-conversion for z/OS. - if (auto EC = llvm::enablezOSAutoConversion(FD)) + ErrorOr<bool> NeedConversion = needConversion(Filename.str().c_str(), FD); + if (std::error_code EC = NeedConversion.getError()) return EC; + // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we + // cannot trust the file size and we create the memory buffer by copying + // off the stream. + // Note: This only works with the assumption of reading a full file (i.e, + // Offset == 0 and MapSize == FileSize). Reading a file slice does not work. + if (Offset == 0 && MapSize == FileSize && *NeedConversion) + return getMemoryBufferForStream(FD, Filename); #endif auto Buf = diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 928c0b5a24ed65..64585ec5b3f5f5 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -326,7 +326,7 @@ ErrorOr<std::unique_ptr<File>> RealFileSystem::openFileForRead(const Twine &Name) { SmallString<256> RealName, Storage; Expected<file_t> FDOrErr = sys::fs::openNativeFileForRead( - adjustPath(Name, Storage), sys::fs::OF_None, &RealName); + adjustPath(Name, Storage), sys::fs::OF_Text, &RealName); if (!FDOrErr) return errorToErrorCode(FDOrErr.takeError()); return std::unique_ptr<File>( >From ea68964c6be407b2efb9831ccbc3258b412489ea Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <abhina.sreeskanthara...@ibm.com> Date: Mon, 9 Sep 2024 13:09:16 -0400 Subject: [PATCH 2/3] rename functions --- llvm/include/llvm/Support/AutoConvert.h | 4 ++-- llvm/lib/Support/AutoConvert.cpp | 6 +++--- llvm/lib/Support/MemoryBuffer.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index a9b3e0357589bc..f6ace802dab2e3 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -54,10 +54,10 @@ std::error_code restorezOSStdHandleAutoConversion(int FD); std::error_code setzOSFileTag(int FD, int CCSID, bool Text); // Get the the tag ccsid for a file name or a file descriptor. -ErrorOr<__ccsid_t> getFileTag(const char *FileName, const int FD = -1); +ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1); // Query the file tag to determine if it needs conversion to UTF-8 codepage. -ErrorOr<bool> needConversion(const char *FileName, const int FD = -1); +ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1); } // namespace llvm #endif // __cplusplus diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index 4b0463ee5c92ac..f7918548df1d0d 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -118,7 +118,7 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { return std::error_code(); } -ErrorOr<__ccsid_t> llvm::getFileTag(const char *FileName, const int FD) { +ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) { // If we have a file descriptor, use it to find out file tagging. Otherwise we // need to use stat() with the file path. if (FD != -1) { @@ -137,8 +137,8 @@ ErrorOr<__ccsid_t> llvm::getFileTag(const char *FileName, const int FD) { return Attr.st_tag.ft_ccsid; } -ErrorOr<bool> llvm::needConversion(const char *FileName, const int FD) { - ErrorOr<__ccsid_t> Ccsid = getFileTag(FileName, FD); +ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) { + ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD); if (std::error_code EC = Ccsid.getError()) return EC; // We don't need conversion for UTF-8 tagged files or binary files. diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index be351129879ea4..25a15b70829050 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -509,7 +509,7 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, } #ifdef __MVS__ - ErrorOr<bool> NeedConversion = needConversion(Filename.str().c_str(), FD); + ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD); if (std::error_code EC = NeedConversion.getError()) return EC; // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we >From 358fd3fee19d3bde61b5437c01a7ccc85c3ab5bd Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <abhina.sreeskanthara...@ibm.com> Date: Mon, 9 Sep 2024 15:22:52 -0400 Subject: [PATCH 3/3] revert text assumption --- clang/lib/Basic/SourceManager.cpp | 2 +- llvm/lib/Support/VirtualFileSystem.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 44b56a352dfc4e..aa8553d3b16604 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -627,7 +627,7 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename, unsigned FileSize = File.getSize(); #ifdef __MVS__ llvm::ErrorOr<bool> NeedConversion = - llvm::needConversion(Filename.str().c_str()); + llvm::needzOSConversion(Filename.str().c_str()); if (NeedConversion && *NeedConversion) { // Buffer size may increase due to potential z/OS EBCDIC to UTF-8 // conversion. diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 64585ec5b3f5f5..928c0b5a24ed65 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -326,7 +326,7 @@ ErrorOr<std::unique_ptr<File>> RealFileSystem::openFileForRead(const Twine &Name) { SmallString<256> RealName, Storage; Expected<file_t> FDOrErr = sys::fs::openNativeFileForRead( - adjustPath(Name, Storage), sys::fs::OF_Text, &RealName); + adjustPath(Name, Storage), sys::fs::OF_None, &RealName); if (!FDOrErr) return errorToErrorCode(FDOrErr.takeError()); return std::unique_ptr<File>( _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits