MarcusJohnson91 updated this revision to Diff 359704.
MarcusJohnson91 added a comment.
Herald added subscribers: llvm-commits, dexonsmith, hiraditya.
Herald added a project: LLVM.
Few tweaks since last time, nothing big
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D103426/new/
https://reviews.llvm.org/D103426
Files:
clang/include/clang/AST/Expr.h
clang/include/clang/AST/FormatString.h
clang/include/clang/AST/Type.h
clang/lib/AST/Expr.cpp
clang/lib/AST/PrintfFormatString.cpp
clang/lib/AST/Type.cpp
clang/lib/Sema/SemaChecking.cpp
llvm/include/llvm/Support/ConvertUTF.h
llvm/lib/Support/ConvertUTFWrapper.cpp
Index: llvm/lib/Support/ConvertUTFWrapper.cpp
===================================================================
--- llvm/lib/Support/ConvertUTFWrapper.cpp
+++ llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -140,6 +140,64 @@
llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
Src.size() * sizeof(UTF16)), Out);
}
+
+// Converts raw bytes holding UTF-32 code units into UTF-8. A leading byte
+// order mark selects the byte order (native or swapped) and is not emitted.
+// Returns false if the byte count is not a multiple of sizeof(UTF32) or if the
+// conversion fails. Out must be empty on entry.
+bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
+  assert(Out.empty());
+
+  // Error out unless the input is a whole number of UTF-32 code units.
+  if (SrcBytes.size() % sizeof(UTF32))
+    return false;
+
+  // Avoid OOB by returning early on empty input.
+  if (SrcBytes.empty())
+    return true;
+
+  const UTF32 *Src = reinterpret_cast<const UTF32 *>(SrcBytes.begin());
+  const UTF32 *SrcEnd = reinterpret_cast<const UTF32 *>(SrcBytes.end());
+
+  assert((uintptr_t)Src % sizeof(UTF32) == 0);
+
+  // Byteswap if necessary. The swapped form of the 32-bit BOM is derived from
+  // the native one so this check cannot be confused with the 16-bit marker.
+  std::vector<UTF32> ByteSwapped;
+  if (Src[0] == llvm::ByteSwap_32(UNI_UTF32_BYTE_ORDER_MARK_NATIVE)) {
+    ByteSwapped.assign(Src, SrcEnd);
+    for (UTF32 &CodeUnit : ByteSwapped)
+      CodeUnit = llvm::ByteSwap_32(CodeUnit);
+    Src = ByteSwapped.data();
+    SrcEnd = Src + ByteSwapped.size();
+  }
+
+  // Skip the BOM for conversion.
+  if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE)
+    Src++;
+
+  // Just allocate enough space up front. We'll shrink it later. Each remaining
+  // code unit needs at most UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes; allocate
+  // one more so that a null terminator fits without reallocating.
+  Out.resize(static_cast<size_t>(SrcEnd - Src) *
+                 UNI_MAX_UTF8_BYTES_PER_CODE_POINT +
+             1);
+  UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
+  UTF8 *DstEnd = Dst + Out.size();
+
+  ConversionResult CR =
+      ConvertUTF32toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
+  assert(CR != targetExhausted);
+
+  if (CR != conversionOK) {
+    Out.clear();
+    return false;
+  }
+
+  // Trim to the bytes actually written. The push/pop pair leaves a null byte
+  // just past the end without counting it in the string's size.
+  Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
+  Out.push_back(0);
+  Out.pop_back();
+  return true;
+}
+
+bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out) {
+  // View the code units as raw bytes and defer to the byte-based UTF-32
+  // overload (not the UTF-16 one, which would misread 32-bit units).
+  return convertUTF32ToUTF8String(
+      llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
+                           Src.size() * sizeof(UTF32)),
+      Out);
+}
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
SmallVectorImpl<UTF16> &DstUTF16) {
Index: llvm/include/llvm/Support/ConvertUTF.h
===================================================================
--- llvm/include/llvm/Support/ConvertUTF.h
+++ llvm/include/llvm/Support/ConvertUTF.h
@@ -122,6 +122,9 @@
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
+
+/* The UTF-32 byte order mark, and the value read when its four bytes are in
+   the opposite byte order (0x0000FEFF with all four bytes reversed). */
+#define UNI_UTF32_BYTE_ORDER_MARK_NATIVE  0x0000FEFF
+#define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000
typedef enum {
conversionOK, /* conversion successful */
@@ -277,6 +280,24 @@
* \returns true on success
*/
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
+
+/**
+ * Converts a stream of raw bytes assumed to be UTF-32 into a UTF-8
+ * std::string. A leading byte order mark is skipped rather than converted.
+ *
+ * \param [in] SrcBytes A buffer of what is assumed to be UTF-32 encoded text.
+ * \param [out] Out Converted UTF-8 is stored here on success. Must be empty
+ *                  on entry; it is cleared if the conversion fails.
+ * \returns true on success
+ */
+bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
+
+/**
+ * Converts a UTF-32 string into a UTF-8 std::string.
+ *
+ * \param [in] Src A buffer of UTF-32 encoded text.
+ * \param [out] Out Converted UTF-8 is stored here on success.
+ * \returns true on success
+ */
+bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out);
/**
* Converts a UTF-8 string into a UTF-16 string with native endianness.
Index: clang/lib/Sema/SemaChecking.cpp
===================================================================
--- clang/lib/Sema/SemaChecking.cpp
+++ clang/lib/Sema/SemaChecking.cpp
@@ -9575,15 +9575,15 @@
// Emit a warning if the string literal is truncated and does not contain an
// embedded null character.
- if (TypeSize < StrRef.size() &&
- StrRef.substr(0, TypeSize).find('\0') == StringRef::npos) {
- CheckFormatHandler::EmitFormatDiagnostic(
- S, inFunctionCall, Args[format_idx],
- S.PDiag(diag::warn_printf_format_string_not_null_terminated),
- FExpr->getBeginLoc(),
- /*IsStringLocation=*/true, OrigFormatExpr->getSourceRange());
- return;
- }
+ if (TypeSize < StrRef.size() &&
+ StrRef.substr(0, TypeSize).find('\0') == StringRef::npos) {
+ CheckFormatHandler::EmitFormatDiagnostic(
+ S, inFunctionCall, Args[format_idx],
+ S.PDiag(diag::warn_printf_format_string_not_null_terminated),
+ FExpr->getBeginLoc(),
+ /*IsStringLocation=*/true, OrigFormatExpr->getSourceRange());
+ return;
+ }
// CHECK: empty format string?
if (StrLen == 0 && numDataArgs > 0) {
Index: clang/lib/AST/Type.cpp
===================================================================
--- clang/lib/AST/Type.cpp
+++ clang/lib/AST/Type.cpp
@@ -1962,18 +1962,6 @@
return false;
}
-bool Type::isType(const std::string TypeName) const {
- QualType Desugar = this->getLocallyUnqualifiedSingleStepDesugaredType();
-
-
- while (!Desugar->isCanonicalUnqualified()) {
- if (Desugar.getAsString() == TypeName) {
- return true;
- }
- Desugar = Desugar->getLocallyUnqualifiedSingleStepDesugaredType();
- }
-}
-
bool Type::isChar8Type() const {
if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
return BT->getKind() == BuiltinType::Char8;
@@ -1985,7 +1973,15 @@
if (BT->getKind() == BuiltinType::Char16)
return true;
if (!LangOpts.CPlusPlus) {
- return isType("char16_t");
+ QualType Desugar = this->getLocallyUnqualifiedSingleStepDesugaredType();
+
+
+ while (!Desugar->isCanonicalUnqualified()) {
+ if (Desugar.getAsString() == "char16_t") {
+ return true;
+ }
+ Desugar = Desugar->getLocallyUnqualifiedSingleStepDesugaredType();
+ }
}
return false;
}
@@ -1995,7 +1991,14 @@
if (BT->getKind() == BuiltinType::Char32)
return true;
if (!LangOpts.CPlusPlus) {
- return isType("char32_t");
+ QualType Desugar = this->getLocallyUnqualifiedSingleStepDesugaredType();
+
+ while (!Desugar->isCanonicalUnqualified()) {
+ if (Desugar.getAsString() == "char32_t") {
+ return true;
+ }
+ Desugar = Desugar->getLocallyUnqualifiedSingleStepDesugaredType();
+ }
}
return false;
}
@@ -2017,7 +2020,7 @@
}
}
if (!LangOpts.CPlusPlus) {
- return isType("char16_t") | isType("char32_t");
+ return isChar16Type(LangOpts) | isChar32Type(LangOpts);
}
return false;
}
Index: clang/lib/AST/PrintfFormatString.cpp
===================================================================
--- clang/lib/AST/PrintfFormatString.cpp
+++ clang/lib/AST/PrintfFormatString.cpp
@@ -643,6 +643,9 @@
"const unichar *");
return ArgType(ArgType::WCStrTy, "wchar_t *");
}
+ if (LM.getKind() == LengthModifier::AsWide) {
+ return ArgType(ArgType::WCStrTy, "wchar_t *");
+ }
if (LM.getKind() == LengthModifier::AsUTF16)
return ArgType(ArgType::Char16Ty, "char16_t *");
if (LM.getKind() == LengthModifier::AsUTF32)
Index: clang/lib/AST/Expr.cpp
===================================================================
--- clang/lib/AST/Expr.cpp
+++ clang/lib/AST/Expr.cpp
@@ -1071,70 +1071,45 @@
std::string Output = "";
char *CString = nullptr;
- switch (StringKind) {
+ switch (getKind()) {
case StringKind::Ascii:
- case StringKind::UTF8:
LLVM_FALLTHROUGH;
+ case StringKind::UTF8:
return getTrailingObjects<char>();
break;
- case StringKind::UTF16:
- std::string Trail16 = getTrailingObjects<char>();
+ case StringKind::UTF16: {
+ std::string Trail16 = "";
+ Trail16 = getTrailingObjects<char>();
ArrayRef<char> ArrayRef16(Trail16.c_str(), Trail16.length());
if (llvm::convertUTF16ToUTF8String(ArrayRef16, Output)) {
CString = new char[Output.size() + 1];
return CString;
}
break;
- case StringKind::UTF32:
- std::string Trail32 = getTrailingObjects<char>();
+ }
+ case StringKind::UTF32: {
+ std::string Trail32 = "";
+ Trail32 = getTrailingObjects<char>();
ArrayRef<char> ArrayRef32(Trail32.c_str(), Trail32.length());
- if (llvm::convertUTF32toUTF8String(ArrayRef32, Output)) {
+ if (llvm::convertUTF32ToUTF8String(ArrayRef32, Output)) {
CString = new char[Output.size() + 1];
return CString;
}
break;
- case StringKind::Wide:
+ }
+ case StringKind::Wide: {
if (llvm::convertWideToUTF8(getStringAsWChar(), Output)) {
CString = new char[Output.size() + 1]; // +1 for terminating NUL
return CString;
}
break;
+ }
}
}
const char *StringLiteral::getStrDataAsChar() const {
- std::string Output = "";
- char *CString = nullptr;
-
- switch (StringKind) {
- case StringKind::Ascii:
- case StringKind::UTF8:
- LLVM_FALLTHROUGH;
- return getTrailingObjects<char>();
- break;
- case StringKind::UTF16:
- std::string Trail16 = getTrailingObjects<char>();
- ArrayRef<char> ArrayRef16(Trail16.c_str(), Trail16.length());
- if (llvm::convertUTF16ToUTF8String(ArrayRef16, Output)) {
- CString = new char[Output.size() + 1];
- return CString;
- }
- break;
- case StringKind::UTF32:
- std::string Trail32 = getTrailingObjects<char>();
- ArrayRef<char> ArrayRef32(Trail32.c_str(), Trail32.length());
- if (llvm::convertUTF32toUTF8String(ArrayRef32, Output)) {
- CString = new char[Output.size() + 1];
- return CString;
- }
- break;
- case StringKind::Wide:
- if (llvm::convertWideToUTF8(getStringAsWChar(), Output)) {
- CString = new char[Output.size() + 1]; // +1 for terminating NUL
- return CString;
- }
- break;
- }
+  // Share the implementation with the non-const overload. A plain or
+  // class-qualified call from this const member would resolve back to this
+  // same overload and recurse without end, so drop constness from `this`
+  // explicitly to select the non-const one.
+  return const_cast<StringLiteral *>(this)->getStrDataAsChar();
}
StringLiteral::StringLiteral(const ASTContext &Ctx, StringRef Str,
@@ -1331,6 +1306,8 @@
const LangOptions &Features,
const TargetInfo &Target, unsigned *StartToken,
unsigned *StartTokenByteOffset) const {
+ assert((getKind() == StringLiteral::Ascii || getKind() == StringLiteral::UTF8) &&
+ "Only narrow string literals are currently supported");
// Loop over all of the tokens in this string until we find the one that
// contains the byte we're looking for.
unsigned TokNo = 0;
Index: clang/include/clang/AST/Type.h
===================================================================
--- clang/include/clang/AST/Type.h
+++ clang/include/clang/AST/Type.h
@@ -1972,7 +1972,6 @@
/// Determine whether this type is a scoped enumeration type.
bool isScopedEnumeralType() const;
bool isBooleanType() const;
- bool isType(const std::string TypeName) const;
bool isCharType() const;
bool isChar8Type() const;
bool isWideCharType() const;
Index: clang/include/clang/AST/FormatString.h
===================================================================
--- clang/include/clang/AST/FormatString.h
+++ clang/include/clang/AST/FormatString.h
@@ -80,8 +80,8 @@
AsLongDouble, // 'L'
AsAllocate, // for '%as', GNU extension to C90 scanf
AsMAllocate, // for '%ms', GNU extension to scanf
- AsUTF16, // for '%l16(c|s)', soon to be standardized
- AsUTF32, // for '%l32(c|s)', soon to be standardized
+ AsUTF16, // for '%l16(c|s)', Clang extension
+ AsUTF32, // for '%l32(c|s)', Clang extension
AsWide, // 'w' (MSVCRT, like l but only for c, C, s, S, or Z
AsWideChar = AsLong // for '%ls', only makes sense for printf
};
Index: clang/include/clang/AST/Expr.h
===================================================================
--- clang/include/clang/AST/Expr.h
+++ clang/include/clang/AST/Expr.h
@@ -1775,6 +1775,9 @@
public:
enum StringKind { Ascii, Wide, UTF8, UTF16, UTF32 };
+
+ char *getStrDataAsChar();
+ const char *getStrDataAsChar() const;
private:
unsigned numTrailingObjects(OverloadToken<unsigned>) const { return 1; }
@@ -1786,9 +1789,6 @@
return getByteLength();
}
- char *getStrDataAsChar();
- const char *getStrDataAsChar() const;
-
const uint16_t *getStrDataAsUInt16() const {
return reinterpret_cast<const uint16_t *>(getTrailingObjects<char>());
}
@@ -1850,21 +1850,18 @@
std::u16string getStringAsChar16() const {
assert(getCharByteWidth() == 2 &&
"This function is used in places that assume strings use char16_t");
- //return reinterpret_cast<const char16_t *>(getTrailingObjects<char>());
return std::u16string(reinterpret_cast<const char16_t *>(getTrailingObjects<char>()), reinterpret_cast<const char16_t *>(getTrailingObjects<char>() + getByteLength()));
}
std::u32string getStringAsChar32() const {
assert(getCharByteWidth() == 4 &&
"This function is used in places that assume strings use char32_t");
- //return reinterpret_cast<const char32_t *>(getTrailingObjects<char>());
return std::u32string(reinterpret_cast<const char32_t *>(getTrailingObjects<char>()), reinterpret_cast<const char32_t *>(getTrailingObjects<char>() + getByteLength()));
}
std::wstring getStringAsWChar() const {
assert((getCharByteWidth() == 2 || getCharByteWidth() == 4) &&
"This function is used in places that assume strings use wchar_t");
- //return reinterpret_cast<const wchar_t *>(getTrailingObjects<char>());
return std::wstring(reinterpret_cast<const wchar_t *>(getTrailingObjects<char>()), reinterpret_cast<const wchar_t *>(getTrailingObjects<char>() + getByteLength()));
}
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits