https://github.com/abhina-sree created https://github.com/llvm/llvm-project/pull/98652
This fixes the following error when reading source and header files: error: source file is not valid UTF-8 >From 539af083a22ccdeaa64ccc85d6ffc08b48c9432f Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <abhina.sreeskanthara...@ibm.com> Date: Fri, 12 Jul 2024 11:17:24 -0400 Subject: [PATCH] update autoconversion functionality to fix error: source file is not valid UTF-8 --- clang/include/clang/Basic/FileEntry.h | 9 ++++++ clang/lib/Basic/SourceManager.cpp | 25 ++++++++++++++++ llvm/include/llvm/Support/AutoConvert.h | 7 +++++ llvm/lib/Support/AutoConvert.cpp | 40 ++++++++++++++++++++++++- llvm/lib/Support/MemoryBuffer.cpp | 16 ++++++++-- llvm/lib/Support/VirtualFileSystem.cpp | 2 +- 6 files changed, 95 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h index 68d4bf6093003..56684cc8828d9 100644 --- a/clang/include/clang/Basic/FileEntry.h +++ b/clang/include/clang/Basic/FileEntry.h @@ -70,6 +70,11 @@ class FileEntryRef { const FileEntry &getFileEntry() const { return *getBaseMapEntry().second->V.get<FileEntry *>(); } +#ifdef __MVS__ + FileEntry &getFileEntry() { + return *getBaseMapEntry().second->V.get<FileEntry *>(); + } +#endif DirectoryEntryRef getDir() const { return ME->second->Dir; } inline off_t getSize() const; @@ -323,6 +328,10 @@ class FileEntry { StringRef tryGetRealPathName() const { return RealPathName; } off_t getSize() const { return Size; } +#ifdef __MVS__ + // Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion. 
+ void setSize(off_t NewSize) { Size = NewSize; } +#endif unsigned getUID() const { return UID; } const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; } time_t getModificationTime() const { return ModTime; } diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 533a9fe88a215..1b6197e4fa054 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/AutoConvert.h" #include "llvm/Support/Capacity.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" @@ -166,8 +167,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // Unless this is a named pipe (in which case we can handle a mismatch), // check that the file's size is the same as in the file entry (which may // have come from a stat cache). +#ifndef __MVS__ if (!ContentsEntry->isNamedPipe() && Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) { +#else + // The buffer will always be larger than the file size on z/OS in the presence + // of characters outside the base character set. + if (!ContentsEntry->isNamedPipe() && + Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { +#endif if (Diag.isDiagnosticInFlight()) Diag.SetDelayedDiagnostic(diag::err_file_modified, ContentsEntry->getName()); @@ -616,6 +624,23 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename, return FileID::get(LoadedID); } unsigned FileSize = File.getSize(); +#ifdef __MVS__ + llvm::ErrorOr<bool> NeedConversion = + llvm::needConversion(Filename.str().c_str()); + if (NeedConversion && *NeedConversion) { + // Buffer size may increase due to potential z/OS EBCDIC to UTF-8 + // conversion. 
+ if (std::optional<llvm::MemoryBufferRef> Buffer = + File.getBufferOrNone(Diag, getFileManager())) { + unsigned BufSize = Buffer->getBufferSize(); + if (BufSize > FileSize) { + if (File.ContentsEntry.has_value()) + File.ContentsEntry->getFileEntry().setSize(BufSize); + FileSize = BufSize; + } + } + } +#endif if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset && NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) { Diag.Report(IncludePos, diag::err_sloc_space_too_large); diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index f4fe80b22a876..784d8f2d3bf75 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -17,6 +17,7 @@ #ifdef __MVS__ #include <_Ccsid.h> #ifdef __cplusplus +#include "llvm/Support/ErrorOr.h" #include <system_error> #endif // __cplusplus @@ -52,6 +53,12 @@ std::error_code restoreStdHandleAutoConversion(int FD); /// \brief Set the tag information for a file descriptor. std::error_code setFileTag(int FD, int CCSID, bool Text); +// Get the tag ccsid for a file name or a file descriptor. +ErrorOr<__ccsid_t> getFileTag(const char *FileName, const int FD = -1); + +// Query the file tag to determine if it needs conversion to UTF-8 codepage. 
+ErrorOr<bool> needConversion(const char *FileName, const int FD = -1); + } // namespace llvm #endif // __cplusplus diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index c509284ee916e..0792265733a43 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -20,6 +20,8 @@ #include <sys/stat.h> #include <unistd.h> +using namespace llvm; + static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1}; int disableAutoConversion(int FD) { @@ -116,4 +118,40 @@ std::error_code llvm::setFileTag(int FD, int CCSID, bool Text) { return std::error_code(); } -#endif // __MVS__ +ErrorOr<__ccsid_t> llvm::getFileTag(const char *FileName, const int FD) { + // If we have a file descriptor, use it to find out file tagging. Otherwise we + // need to use stat() with the file path. + if (FD != -1) { + struct f_cnvrt Query = { + QUERYCVT, // cvtcmd + 0, // pccsid + 0, // fccsid + }; + if (fcntl(FD, F_CONTROL_CVT, &Query) == -1) + return std::error_code(errno, std::generic_category()); + return Query.fccsid; + } + struct stat Attr; + if (stat(FileName, &Attr) == -1) + return std::error_code(errno, std::generic_category()); + return Attr.st_tag.ft_ccsid; +} + +ErrorOr<bool> llvm::needConversion(const char *FileName, const int FD) { + ErrorOr<__ccsid_t> Ccsid = getFileTag(FileName, FD); + if (std::error_code EC = Ccsid.getError()) + return EC; + // We don't need conversion for UTF-8 tagged files or binary files. + // TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve + // problems related to UTF-8 tagged source files. 
+ switch (*Ccsid) { + case CCSID_UTF_8: + case CCSID_ISO8859_1: + case FT_BINARY: + return false; + default: + return true; + } +} + +#endif //__MVS__ diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index fb7e804fd7e84..be351129879ea 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -362,6 +362,11 @@ static bool shouldUseMmap(sys::fs::file_t FD, bool RequiresNullTerminator, int PageSize, bool IsVolatile) { +#if defined(__MVS__) + // z/OS Enhanced ASCII auto convert does not support mmap. + return false; +#endif + // mmap may leave the buffer without null terminator if the file size changed // by the time the last page is mapped in, so avoid it if the file size is // likely to change. @@ -504,9 +509,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, } #ifdef __MVS__ - // Set codepage auto-conversion for z/OS. - if (auto EC = llvm::enableAutoConversion(FD)) + ErrorOr<bool> NeedConversion = needConversion(Filename.str().c_str(), FD); + if (std::error_code EC = NeedConversion.getError()) return EC; + // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we + // cannot trust the file size and we create the memory buffer by copying + // off the stream. + // Note: This only works with the assumption of reading a full file (i.e., + // Offset == 0 and MapSize == FileSize). Reading a file slice does not work. 
+ if (Offset == 0 && MapSize == FileSize && *NeedConversion) + return getMemoryBufferForStream(FD, Filename); #endif auto Buf = diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index ce2bf2b4193a5..fde63ddc7e20a 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -326,7 +326,7 @@ ErrorOr<std::unique_ptr<File>> RealFileSystem::openFileForRead(const Twine &Name) { SmallString<256> RealName, Storage; Expected<file_t> FDOrErr = sys::fs::openNativeFileForRead( - adjustPath(Name, Storage), sys::fs::OF_None, &RealName); + adjustPath(Name, Storage), sys::fs::OF_Text, &RealName); if (!FDOrErr) return errorToErrorCode(FDOrErr.takeError()); return std::unique_ptr<File>( _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits