https://github.com/cor3ntin updated https://github.com/llvm/llvm-project/pull/138708
>From c23bf23ddc8e1c8f50a57fcaf74682e86d8ade16 Mon Sep 17 00:00:00 2001 From: Corentin Jabot <corentinja...@gmail.com> Date: Tue, 6 May 2025 17:14:35 +0200 Subject: [PATCH 1/8] [Clang] Add warnings when mixing different charN_t types The charN_t types represent code units of different UTF encodings. Therefore, the values of two objects of different charN_t types do not represent the same characters. In order to avoid comparing apples and oranges, we add new warnings to warn on: - Implicit conversions - Comparisons - Other cases involving arithmetic conversions We only produce the warning if we cannot establish through constant evaluation that the comparison would be safe. The new `-Wimplicit-unicode-conversion` warning is enabled by default. Note that this PR intentionally doesn't touch char/wchar_t, but it would be worth considering extending the new warnings to these types (in a follow-up). Additionally, most arithmetic operations on charN_t don't really make sense (i.e., what does it mean to add code units?), so we could add warnings for that.
Fixes #138526 --- clang/docs/ReleaseNotes.rst | 4 + clang/include/clang/AST/ASTDiagnostic.h | 3 + clang/include/clang/AST/Type.h | 1 + clang/include/clang/Basic/DiagnosticGroups.td | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 25 +++ clang/lib/AST/ASTDiagnostic.cpp | 29 ++++ clang/lib/AST/Type.cpp | 14 ++ clang/lib/Sema/SemaChecking.cpp | 48 ++++++ clang/lib/Sema/SemaExpr.cpp | 71 ++++++++ .../warn-implicit-unicode-conversions.cpp | 155 ++++++++++++++++++ llvm/include/llvm/Support/ConvertUTF.h | 4 + llvm/lib/Support/ConvertUTFWrapper.cpp | 10 ++ 12 files changed, 365 insertions(+) create mode 100644 clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 203958dab7430..3a42f43d79fd1 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -503,6 +503,10 @@ Improvements to Clang's diagnostics - ``-Wreserved-identifier`` now fires on reserved parameter names in a function declaration which is not a definition. +- A new ``-Wimplicit-unicode-conversion`` warns where comparing or implicitly converting + between different Unicode character types (``char8_t``, ``char16_t``, ``char32_t``). + This warning only triggers in C++ as these types are aliases in C. (#GH138526) + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/AST/ASTDiagnostic.h b/clang/include/clang/AST/ASTDiagnostic.h index ef22249828629..baa410e3e4a03 100644 --- a/clang/include/clang/AST/ASTDiagnostic.h +++ b/clang/include/clang/AST/ASTDiagnostic.h @@ -38,6 +38,9 @@ namespace clang { /// is initialized before passing it in. 
QualType desugarForDiagnostic(ASTContext &Context, QualType QT, bool &ShouldAKA); + + std::string FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T); + } // end namespace clang #endif diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 02a6fb5333538..7fca11fb708cf 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2524,6 +2524,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { bool isChar16Type() const; bool isChar32Type() const; bool isAnyCharacterType() const; + bool isUnicodeCharacterType() const; bool isIntegralType(const ASTContext &Ctx) const; /// Determine whether this type is an integral or enumeration type. diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 1faf8508121f4..e5b5dbbd07f10 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -111,6 +111,7 @@ def EnumConversion : DiagGroup<"enum-conversion", ImplicitEnumEnumCast, EnumFloatConversion, EnumCompareConditional]>; +def ImplicitUnicodeConversion : DiagGroup<"implicit-unicode-conversion">; def DeprecatedOFast : DiagGroup<"deprecated-ofast">; def ObjCSignedCharBoolImplicitIntConversion : DiagGroup<"objc-signed-char-bool-implicit-int-conversion">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index e5a7cdc14a737..a018f6693cff2 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -4357,6 +4357,26 @@ def warn_address_of_reference_bool_conversion : Warning< "code; pointer may be assumed to always convert to true">, InGroup<UndefinedBoolConversion>; +def warn_impcast_unicode_char_type : Warning< + "implicit conversion from %0 to %1 may change the meaning of the represented code unit">, + InGroup<ImplicitUnicodeConversion>; +def warn_impcast_unicode_precision : 
Warning< + "implicit conversion from %0 to %1 may lose precision and change the meaning of the represented code unit">, + InGroup<ImplicitUnicodeConversion>; +def warn_impcast_unicode_char_type_constant + : Warning<"implicit conversion from %0 to %1 changes the meaning of the " + "%select{code unit|codepoint}2 '%3'">, + InGroup<ImplicitUnicodeConversion>; + +def warn_comparison_unicode_mixed_types : Warning< + "comparing values of different Unicode code unit types %0 and %1 may compare different codepoints">, + InGroup<ImplicitUnicodeConversion>; + +def warn_comparison_unicode_mixed_types_constant + : Warning<"comparing values of different Unicode code unit types %0 and %1 " + "compares unrelated code units '%2' and '%3'">, + InGroup<ImplicitUnicodeConversion>; + def warn_xor_used_as_pow : Warning< "result of '%0' is %1; did you mean exponentiation?">, InGroup<XorUsedAsPow>; @@ -7719,6 +7739,11 @@ def warn_comparison_of_mixed_enum_types_switch : Warning< "%diff{ ($ and $)|}0,1">, InGroup<EnumCompareSwitch>; +def warn_arith_conv_mixed__unicode_types + : Warning<"%sub{select_arith_conv_kind}0 " + "different Unicode character types %1 and %2">, + InGroup<ImplicitUnicodeConversion>; + def err_typecheck_assign_const : Error< "%select{" "cannot assign to return value because function %1 returns a const value|" diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp index 6cb09b0492ac9..0c9f50fb1a01c 100644 --- a/clang/lib/AST/ASTDiagnostic.cpp +++ b/clang/lib/AST/ASTDiagnostic.cpp @@ -20,6 +20,8 @@ #include "clang/AST/TemplateBase.h" #include "clang/AST/Type.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" using namespace clang; @@ -2190,3 +2192,30 @@ static bool FormatTemplateTypeDiff(ASTContext &Context, QualType FromType, TD.DiffTemplate(); return TD.Emit(); } + +std::string clang::FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T) { + auto 
IsSingleCodeUnitCP = [](unsigned Value, QualType T) { + if (T->isChar8Type()) { + assert(Value <= 0xFF && "not a valid UTF-8 code unit"); + return Value <= 0x7F; + } + if (T->isChar16Type()) { + assert(Value <= 0xFFFF && "not a valid UTF-16 code unit"); + return llvm::IsSingleCodeUnitUTF16Codepoint(Value); + } + return llvm::IsSingleCodeUnitUTF32Codepoint(Value); + }; + llvm::SmallVector<char, 4> Str; + if (!IsSingleCodeUnitCP(Value, T)) { + llvm::raw_svector_ostream OS(Str); + OS << "<" << llvm::format_hex(Value, 1, /*Upper=*/true) << ">"; + return std::string(Str.begin(), Str.end()); + } + + char Buffer[UNI_MAX_UTF8_BYTES_PER_CODE_POINT]; + char *Ptr = Buffer; + [[maybe_unused]] bool Converted = llvm::ConvertCodePointToUTF8(Value, Ptr); + assert(Converted && "trying to encode invalid code unit"); + EscapeStringForDiagnostic(StringRef(Buffer, Ptr - Buffer), Str); + return std::string(Str.begin(), Str.end()); +} diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index fbd09141bc541..2da63b13faf9d 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2193,6 +2193,20 @@ bool Type::isAnyCharacterType() const { } } +bool Type::isUnicodeCharacterType() const { + const auto *BT = dyn_cast<BuiltinType>(CanonicalType); + if (!BT) + return false; + switch (BT->getKind()) { + default: + return false; + case BuiltinType::Char8: + case BuiltinType::Char16: + case BuiltinType::Char32: + return true; + } +} + /// isSignedIntegerType - Return true if this is an integer type that is /// signed, according to C99 6.2.5p4 [char, signed char, short, int, long..], /// an enum decl which has a signed representation diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 97f623f61a405..d12b5cea37aa6 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -14,6 +14,7 @@ #include "CheckExprLifetime.h" #include "clang/AST/APValue.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ASTDiagnostic.h" 
#include "clang/AST/Attr.h" #include "clang/AST/AttrIterator.h" #include "clang/AST/CharUnits.h" @@ -11810,6 +11811,46 @@ static void DiagnoseIntInBoolContext(Sema &S, Expr *E) { } } +static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source, + const Type *Target, Expr *E, + QualType T, + SourceLocation CC) { + assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() && + Source != Target); + Expr::EvalResult Result; + if (E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects, + S.isConstantEvaluatedContext())) { + llvm::APSInt Value(32); + Value = Result.Val.getInt(); + bool IsASCII = Value <= 0x7F; + bool IsBMP = Value <= 0xD7FF || (Value >= 0xE000 && Value <= 0xFFFF); + bool ConversionPreservesSemantics = + IsASCII || (!Source->isChar8Type() && !Target->isChar8Type() && IsBMP); + + if (!ConversionPreservesSemantics) { + auto IsSingleCodeUnitCP = [](const QualType &T, + const llvm::APSInt &Value) { + if (T->isChar8Type()) + return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue()); + if (T->isChar16Type()) + return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue()); + return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue()); + }; + + S.Diag(CC, diag::warn_impcast_unicode_char_type_constant) + << E->getType() << T + << IsSingleCodeUnitCP(E->getType().getUnqualifiedType(), Value) + << FormatUTFCodeUnitAsCodepoint(Value.getExtValue(), E->getType()); + } + } else { + bool LosesPrecision = S.getASTContext().getIntWidth(E->getType()) > + S.getASTContext().getIntWidth(T); + DiagnoseImpCast(S, E, T, CC, + LosesPrecision ? 
diag::warn_impcast_unicode_precision + : diag::warn_impcast_unicode_char_type); + } +} + void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC, bool *ICContext, bool IsListInit) { if (E->isTypeDependent() || E->isValueDependent()) return; @@ -12147,6 +12188,13 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC, DiscardMisalignedMemberAddress(Target, E); + + if(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) { + DiagnoseMixedUnicodeImplicitConversion(*this, Source, Target, E, T, CC); + return; + } + + if (Target->isBooleanType()) DiagnoseIntInBoolContext(*this, E); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index be3f145f3c5f1..b0080b778db61 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -15,6 +15,7 @@ #include "UsedDeclVisitor.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ASTDiagnostic.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/ASTMutationListener.h" #include "clang/AST/CXXInheritance.h" @@ -1567,6 +1568,72 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS, } } +static void CheckUnicodeArithmeticConversions(Sema & SemaRef, + Expr *LHS, + Expr *RHS, + SourceLocation Loc, + ArithConvKind ACK) { + QualType LHSType = LHS->getType().getUnqualifiedType(); + QualType RHSType = RHS->getType().getUnqualifiedType(); + + if(!SemaRef.getLangOpts().CPlusPlus || + !LHSType->isUnicodeCharacterType() || !RHSType->isUnicodeCharacterType()) + return; + + if(ACK == ArithConvKind::Comparison) { + if (SemaRef.getASTContext().hasSameType(LHSType, RHSType)) + return; + + Expr::EvalResult LHSRes, RHSRes; + bool Success = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(), + Expr::SE_AllowSideEffects, + SemaRef.isConstantEvaluatedContext()); + if (Success) + Success = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(), + Expr::SE_AllowSideEffects, + SemaRef.isConstantEvaluatedContext()); + 
if (Success) { + llvm::APSInt LHSValue(32); + LHSValue = LHSRes.Val.getInt(); + llvm::APSInt RHSValue(32); + RHSValue = RHSRes.Val.getInt(); + + auto IsSingleCodeUnitCP = [](const QualType &T, + const llvm::APSInt &Value) { + if (T->isChar8Type()) + return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue()); + if (T->isChar16Type()) + return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue()); + return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue()); + }; + + bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue); + bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue); + if (LHSSafe && RHSSafe) + return; + + SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant) + << LHS->getSourceRange() << RHS->getSourceRange() << LHSType + << RHSType + << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType) + << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType); + return; + } + SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types) + << LHS->getSourceRange() << RHS->getSourceRange() + << LHSType << RHSType; + return; + } + + if (SemaRef.getASTContext().hasSameType(LHSType, RHSType)) + return; + + SemaRef.Diag(Loc, diag::warn_arith_conv_mixed__unicode_types) + << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType + << RHSType; + return; +} + /// UsualArithmeticConversions - Performs various conversions that are common to /// binary operators (C99 6.3.1.8). If both operands aren't arithmetic, this /// routine returns the first non-arithmetic type found. 
The client is @@ -1574,8 +1641,12 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS, QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, ArithConvKind ACK) { + checkEnumArithmeticConversions(LHS.get(), RHS.get(), Loc, ACK); + CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(), + Loc, ACK); + if (ACK != ArithConvKind::CompAssign) { LHS = UsualUnaryConversions(LHS.get()); if (LHS.isInvalid()) diff --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp new file mode 100644 index 0000000000000..41794b15175b5 --- /dev/null +++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp @@ -0,0 +1,155 @@ +// RUN: %clang_cc1 -verify -fsyntax-only -std=c++20 -Wconversion %s + +void c8(char8_t); +void c16(char16_t); +void c32(char32_t); + +void test(char8_t u8, char16_t u16, char32_t u32) { + c8(u8); + c8(u16); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' may lose precision and change the meaning of the represented code unit}} + c8(u32); // expected-warning {{implicit conversion from 'char32_t' to 'char8_t' may lose precision and change the meaning of the represented code unit}} + + c16(u8); // expected-warning {{implicit conversion from 'char8_t' to 'char16_t' may change the meaning of the represented code unit}} + c16(u16); + c16(u32); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' may lose precision and change the meaning of the represented code unit}} + + c32(u8); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' may change the meaning of the represented code unit}} + c32(u16); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' may change the meaning of the represented code unit}} + c32(u32); + + + c8(char32_t(0x7f)); + c8(char32_t(0x80)); // expected-warning {{implicit conversion from 'char32_t' to 'char8_t' changes the meaning of the 
codepoint '<U+0080>'}} + + c8(char16_t(0x7f)); + c8(char16_t(0x80)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the codepoint '<U+0080>'}} + c8(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code unit '<0xD800>'}} + c8(char16_t(0xE000)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the codepoint '<U+E000>'}} + + + c16(char32_t(0x7f)); + c16(char32_t(0x80)); + c16(char32_t(0xD7FF)); + c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}} + c16(char32_t(0xE000)); + c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the codepoint '🐉'}} + + + c32(char8_t(0x7f)); + c32(char8_t(0x80)); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' changes the meaning of the code unit '<0x80>'}} + c32(char8_t(0xFF)); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' changes the meaning of the code unit '<0xFF>'}} + + + c32(char16_t(0x7f)); + c32(char16_t(0x80)); + + c32(char16_t(0xD7FF)); + c32(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xD800>'}} + c32(char16_t(0xDFFF)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xDFFF>'}} + c32(char16_t(0xE000)); + c32(char16_t(u'☕')); + + (void)static_cast<char32_t>(char8_t(0x80)); // sanity check: no explicit conversion; + + using Char8 = char8_t; + Char8 c81 = u16; // expected-warning {{implicit conversion from 'char16_t' to 'Char8' (aka 'char8_t') may lose precision and change the meaning of the represented code unit}} + + [[maybe_unused]] char c = u16; // expected-warning {{implicit conversion loses integer precision: 'char16_t' 
to 'char'}} + + // FIXME: We should apply the same logic to wchar + [[maybe_unused]] wchar_t wc = u16; + [[maybe_unused]] wchar_t wc2 = u8; +} + +void test_comp(char8_t u8, char16_t u16, char32_t u32) { + (void)(u8 == u8' '); + (void)(u8 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' may compare different codepoints}} + (void)(u8 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different codepoints}} + + (void)(u16 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different codepoints}} + (void)(u16 == u' '); + (void)(u16 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different codepoints}} + + (void)(u32 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different codepoints}} + (void)(u32 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different codepoints}} + (void)(u32 == U' '); + + + (void)(u8' ' == u' '); + (void)(u8' ' == u' '); + + + (void)(u8 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different codepoints}} + (void)(u16 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different codepoints}} + (void)(u16 == u' '); + (void)(u16 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different codepoints}} + + (void)(u32 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different codepoints}} + (void)(u32 == u' '); // expected-warning{{comparing 
values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different codepoints}} + (void)(u32 == U' '); + + + (void)(char8_t(0x7f) == char8_t(0x7f)); + (void)(char8_t(0x7f) == char16_t(0x7f)); + (void)(char8_t(0x7f) == char32_t(0x7f)); + + (void)(char8_t(0x80) == char8_t(0x80)); + (void)(char8_t(0x80) == char16_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' compares unrelated code units '<0x80>' and '<U+0080>}} + (void)(char8_t(0x80) == char32_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' compares unrelated code units '<0x80>' and '<U+0080>}} + + (void)(char8_t(0x80) == char8_t(0x7f)); + (void)(char8_t(0x80) == char16_t(0x7f)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' compares unrelated code units '<0x80>' and '<U+007F>'}} + (void)(char8_t(0x80) == char32_t(0x7f)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' compares unrelated code units '<0x80>' and '<U+007F>'}} + + + (void)(char16_t(0x7f) < char8_t(0x7f)); + (void)(char16_t(0x7f) < char16_t(0x7f)); + (void)(char16_t(0x7f) < char32_t(0x7f)); + + (void)(char16_t(0x80) < char8_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' compares unrelated code units '<U+0080>' and '<0x80>'}} + (void)(char16_t(0x80) < char16_t(0x80)); + (void)(char16_t(0x80) < char32_t(0x80)); + + (void)(char16_t(0x80) == char8_t(0x7f)); + (void)(char16_t(0x80) < char16_t(0x7f)); + (void)(char16_t(0x80) < char32_t(0x7f)); + + + (void)(char32_t(0x7f) < char8_t(0x7f)); + (void)(char32_t(0x7f) < char16_t(0x7f)); + (void)(char32_t(0x7f) < char32_t(0x7f)); + + (void)(char32_t(0x80) < char8_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' compares unrelated code units 
'<U+0080>' and '<0x80>'}} + (void)(char32_t(0x80) < char16_t(0x80)); + (void)(char32_t(0x80) < char32_t(0x80)); + + (void)(char32_t(0x80) == char8_t(0x7f)); + (void)(char32_t(0x80) < char16_t(0x7f)); + (void)(char32_t(0x80) < char32_t(0x7f)); + + + (void)(char32_t(U'🐉') <= char16_t(0xD800)); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' compares unrelated code units '🐉' and '<0xD800>'}} + (void)(char32_t(U'🐉') <= char16_t(0xD7FF)); + + (void)(char16_t(0xD800) >= char32_t(U'🐉')); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' compares unrelated code units '<0xD800>' and '🐉'}} + (void)(char16_t(0xD7FF) >= char32_t(U'🐉')); +} + +void check_arithmetic(char8_t u8, char16_t u16, char32_t u32) { + + (void)(u8 + u8); + (void)(u16 += u16); + (void)(u32 & u32); + (void)(1 ? u16 : u16); + + (void)(u8 + u16); // expected-warning {{arithmetic between different Unicode character types 'char8_t' and 'char16_t'}} + (void)(u8 += u16); // expected-warning {{compound assignment of different Unicode character types 'char8_t' and 'char16_t'}} + (void)(u8 & u16); // expected-warning {{bitwise operation between different Unicode character types 'char8_t' and 'char16_t'}} + (void)(1 ? u8 : u16); // expected-warning {{conditional expression between different Unicode character types 'char8_t' and 'char16_t'}} + + + (void)(u16 * u32); // expected-warning {{arithmetic between different Unicode character types 'char16_t' and 'char32_t'}} + (void)(u16 -= u32); // expected-warning {{compound assignment of different Unicode character types 'char16_t' and 'char32_t'}} + (void)(u16 | u32); // expected-warning {{bitwise operation between different Unicode character types 'char16_t' and 'char32_t'}} + (void)(1 ? 
u32 : u16); // expected-warning {{conditional expression between different Unicode character types 'char32_t' and 'char16_t'}} +} diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index 25d46178457d6..e30b3ee68364e 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -328,6 +328,10 @@ bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out); bool convertUTF8ToUTF16String(StringRef SrcUTF8, SmallVectorImpl<UTF16> &DstUTF16); +bool IsSingleCodeUnitUTF8Codepoint(unsigned); +bool IsSingleCodeUnitUTF16Codepoint(unsigned); +bool IsSingleCodeUnitUTF32Codepoint(unsigned); + #if defined(_WIN32) namespace sys { namespace windows { diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp index 4952fe65d7767..76ead00c977bd 100644 --- a/llvm/lib/Support/ConvertUTFWrapper.cpp +++ b/llvm/lib/Support/ConvertUTFWrapper.cpp @@ -303,5 +303,15 @@ bool convertWideToUTF8(const std::wstring &Source, std::string &Result) { } } +bool IsSingleCodeUnitUTF8Codepoint(unsigned V) { return V <= 0x7F; } + +bool IsSingleCodeUnitUTF16Codepoint(unsigned V) { + return V <= 0xD7FF || (V >= 0xE000 && V <= 0xFFFF); +} + +bool IsSingleCodeUnitUTF32Codepoint(unsigned V) { + return V <= 0xD7FF || (V >= 0xE000 && V <= 0x10FFFF); +} + } // end namespace llvm >From 513b292ad18da9c33968f1cc22f71e9256cfebfc Mon Sep 17 00:00:00 2001 From: Corentin Jabot <corentinja...@gmail.com> Date: Tue, 6 May 2025 17:38:28 +0200 Subject: [PATCH 2/8] format --- .../clang/Basic/DiagnosticSemaKinds.td | 21 ++-- clang/lib/Sema/SemaChecking.cpp | 4 +- clang/lib/Sema/SemaExpr.cpp | 112 +++++++++--------- 3 files changed, 67 insertions(+), 70 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a018f6693cff2..9cd5d3d36b928 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -4357,20 +4357,23 @@ def warn_address_of_reference_bool_conversion : Warning< "code; pointer may be assumed to always convert to true">, InGroup<UndefinedBoolConversion>; -def warn_impcast_unicode_char_type : Warning< - "implicit conversion from %0 to %1 may change the meaning of the represented code unit">, - InGroup<ImplicitUnicodeConversion>; -def warn_impcast_unicode_precision : Warning< - "implicit conversion from %0 to %1 may lose precision and change the meaning of the represented code unit">, - InGroup<ImplicitUnicodeConversion>; +def warn_impcast_unicode_char_type + : Warning<"implicit conversion from %0 to %1 may change the meaning of the " + "represented code unit">, + InGroup<ImplicitUnicodeConversion>; +def warn_impcast_unicode_precision + : Warning<"implicit conversion from %0 to %1 may lose precision and change " + "the meaning of the represented code unit">, + InGroup<ImplicitUnicodeConversion>; def warn_impcast_unicode_char_type_constant : Warning<"implicit conversion from %0 to %1 changes the meaning of the " "%select{code unit|codepoint}2 '%3'">, InGroup<ImplicitUnicodeConversion>; -def warn_comparison_unicode_mixed_types : Warning< - "comparing values of different Unicode code unit types %0 and %1 may compare different codepoints">, - InGroup<ImplicitUnicodeConversion>; +def warn_comparison_unicode_mixed_types + : Warning<"comparing values of different Unicode code unit types %0 and %1 " + "may compare different codepoints">, + InGroup<ImplicitUnicodeConversion>; def warn_comparison_unicode_mixed_types_constant : Warning<"comparing values of different Unicode code unit types %0 and %1 " diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index d12b5cea37aa6..9361683ff4a8c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -12188,13 +12188,11 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC, 
DiscardMisalignedMemberAddress(Target, E); - - if(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) { + if (Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) { DiagnoseMixedUnicodeImplicitConversion(*this, Source, Target, E, T, CC); return; } - if (Target->isBooleanType()) DiagnoseIntInBoolContext(*this, E); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index b0080b778db61..a7a7f55f3d34f 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -1568,70 +1568,67 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS, } } -static void CheckUnicodeArithmeticConversions(Sema & SemaRef, - Expr *LHS, - Expr *RHS, - SourceLocation Loc, - ArithConvKind ACK) { - QualType LHSType = LHS->getType().getUnqualifiedType(); - QualType RHSType = RHS->getType().getUnqualifiedType(); - - if(!SemaRef.getLangOpts().CPlusPlus || - !LHSType->isUnicodeCharacterType() || !RHSType->isUnicodeCharacterType()) - return; +static void CheckUnicodeArithmeticConversions(Sema &SemaRef, Expr *LHS, + Expr *RHS, SourceLocation Loc, + ArithConvKind ACK) { + QualType LHSType = LHS->getType().getUnqualifiedType(); + QualType RHSType = RHS->getType().getUnqualifiedType(); + + if (!SemaRef.getLangOpts().CPlusPlus || !LHSType->isUnicodeCharacterType() || + !RHSType->isUnicodeCharacterType()) + return; - if(ACK == ArithConvKind::Comparison) { - if (SemaRef.getASTContext().hasSameType(LHSType, RHSType)) - return; + if (ACK == ArithConvKind::Comparison) { + if (SemaRef.getASTContext().hasSameType(LHSType, RHSType)) + return; - Expr::EvalResult LHSRes, RHSRes; - bool Success = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(), - Expr::SE_AllowSideEffects, - SemaRef.isConstantEvaluatedContext()); - if (Success) - Success = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(), - Expr::SE_AllowSideEffects, - SemaRef.isConstantEvaluatedContext()); - if (Success) { - llvm::APSInt LHSValue(32); - LHSValue = 
LHSRes.Val.getInt(); - llvm::APSInt RHSValue(32); - RHSValue = RHSRes.Val.getInt(); - - auto IsSingleCodeUnitCP = [](const QualType &T, - const llvm::APSInt &Value) { - if (T->isChar8Type()) - return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue()); - if (T->isChar16Type()) - return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue()); - return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue()); - }; - - bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue); - bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue); - if (LHSSafe && RHSSafe) - return; + Expr::EvalResult LHSRes, RHSRes; + bool Success = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(), + Expr::SE_AllowSideEffects, + SemaRef.isConstantEvaluatedContext()); + if (Success) + Success = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(), + Expr::SE_AllowSideEffects, + SemaRef.isConstantEvaluatedContext()); + if (Success) { + llvm::APSInt LHSValue(32); + LHSValue = LHSRes.Val.getInt(); + llvm::APSInt RHSValue(32); + RHSValue = RHSRes.Val.getInt(); + + auto IsSingleCodeUnitCP = [](const QualType &T, + const llvm::APSInt &Value) { + if (T->isChar8Type()) + return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue()); + if (T->isChar16Type()) + return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue()); + return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue()); + }; - SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant) - << LHS->getSourceRange() << RHS->getSourceRange() << LHSType - << RHSType - << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType) - << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType); + bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue); + bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue); + if (LHSSafe && RHSSafe) return; - } - SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types) - << LHS->getSourceRange() << RHS->getSourceRange() - << LHSType << RHSType; - return; - } - if 
(SemaRef.getASTContext().hasSameType(LHSType, RHSType)) + SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant) + << LHS->getSourceRange() << RHS->getSourceRange() << LHSType + << RHSType + << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType) + << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType); return; + } + SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types) + << LHS->getSourceRange() << RHS->getSourceRange() << LHSType << RHSType; + return; + } - SemaRef.Diag(Loc, diag::warn_arith_conv_mixed__unicode_types) - << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType - << RHSType; + if (SemaRef.getASTContext().hasSameType(LHSType, RHSType)) return; + + SemaRef.Diag(Loc, diag::warn_arith_conv_mixed__unicode_types) + << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType + << RHSType; + return; } /// UsualArithmeticConversions - Performs various conversions that are common to @@ -1644,8 +1641,7 @@ QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS, checkEnumArithmeticConversions(LHS.get(), RHS.get(), Loc, ACK); - CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(), - Loc, ACK); + CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(), Loc, ACK); if (ACK != ArithConvKind::CompAssign) { LHS = UsualUnaryConversions(LHS.get()); >From e31e747455a0191ae9ada002861e2cf3e8ab59f0 Mon Sep 17 00:00:00 2001 From: Corentin Jabot <corentinja...@gmail.com> Date: Thu, 8 May 2025 15:42:10 +0200 Subject: [PATCH 3/8] Silence warnings in libc++ --- libcxx/include/print | 2 +- .../alg.nonmodifying/alg.equal/equal.pass.cpp | 2 +- .../alg.nonmodifying/alg.find/find.pass.cpp | 2 +- .../test/std/localization/codecvt_unicode.pass.cpp | 12 ++++++------ .../char16_t_char8_t_in.pass.cpp | 2 +- .../char16_t_char8_t_out.pass.cpp | 2 +- .../char32_t_char8_t_in.pass.cpp | 2 +- .../char32_t_char8_t_out.pass.cpp | 2 +- .../assign2.pass.cpp | 4 ++-- 9 files changed, 15 insertions(+), 15 
deletions(-) diff --git a/libcxx/include/print b/libcxx/include/print index 61c3ebcd98cb8..be05d30e0147f 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -123,7 +123,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value _LIBCPP_ASSERT_UNCATEGORIZED(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-16"); if (__value < 0x10000) { - *__out_it++ = __value; + *__out_it++ = static_cast<iter_value_t<_OutIt>>(__value); return; } diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp index 02cc84c288828..b7266d675c2a1 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp @@ -19,7 +19,7 @@ // equal(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2); // We test the cartesian product, so we sometimes compare differently signed types -// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare +// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -Wno-implicit-unicode-conversion // MSVC warning C4242: 'argument': conversion from 'int' to 'const _Ty', possible loss of data // MSVC warning C4244: 'argument': conversion from 'wchar_t' to 'const _Ty', possible loss of data // MSVC warning C4389: '==': signed/unsigned mismatch diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp index 3aaeb9c2f345f..cfc60369dd69f 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare -// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare +// 
ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -Wno-implicit-unicode-conversion // MSVC warning C4245: conversion from 'int' to 'wchar_t', signed/unsigned mismatch // MSVC warning C4305: truncation from 'int' to 'bool' // MSVC warning C4310: cast truncates constant value diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp index e54c0c2a4610a..7c5f112c7f495 100644 --- a/libcxx/test/std/localization/codecvt_unicode.pass.cpp +++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp @@ -484,7 +484,7 @@ template <class InternT, class ExternT> void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; - const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {u'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); @@ -549,7 +549,7 @@ template <class InternT, class ExternT> void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; - const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {u'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); @@ -618,7 +618,7 @@ template <class InternT, class ExternT> void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP const unsigned char input[] = "b\u0448\uD700\U0010AAAA"; - const char16_t expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {u'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; 
static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); @@ -765,7 +765,7 @@ void utf8_to_utf16_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { template <class InternT, class ExternT> void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP - const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; static_assert(array_size(input) == 6, ""); static_assert(array_size(expected) == 11, ""); @@ -801,7 +801,7 @@ void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) template <class InternT, class ExternT> void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP - const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; static_assert(array_size(input) == 6, ""); static_assert(array_size(expected) == 11, ""); @@ -860,7 +860,7 @@ void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& template <class InternT, class ExternT> void utf16_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP - const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; static_assert(array_size(input) == 6, ""); static_assert(array_size(expected) == 11, ""); diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp 
b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp index c34e864220e12..86a08ee32cb45 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp @@ -33,6 +33,6 @@ int main(int, char**) { assert(from_next - from == 9); assert(to_next - to == 9); for (unsigned i = 0; i < 9; ++i) - assert(to[i] == from[i]); + assert(to[i] == static_cast<char16_t>(from[i])); return 0; } diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp index c39e64de7a59f..d5c0c3cf31244 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp @@ -34,6 +34,6 @@ int main(int, char**) { assert(from_next - from == 9); assert(to_next - to == 9); for (unsigned i = 0; i < 9; ++i) - assert(to[i] == from[i]); + assert(static_cast<char16_t>(to[i]) == from[i]); return 0; } diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp index e848f8a10912e..e6af982c10e99 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp +++ 
b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp @@ -33,6 +33,6 @@ int main(int, char**) { assert(from_next - from == 9); assert(to_next - to == 9); for (unsigned i = 0; i < 9; ++i) - assert(to[i] == from[i]); + assert(to[i] == static_cast<char32_t>(from[i])); return 0; } diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp index 7a31c9ef10558..61a0502022840 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp @@ -34,6 +34,6 @@ int main(int, char**) { assert(from_next - from == 9); assert(to_next - to == 9); for (unsigned i = 0; i < 9; ++i) - assert(to[i] == from[i]); + assert(static_cast<char32_t>(to[i]) == static_cast<char32_t>(from[i])); return 0; } diff --git a/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp b/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp index e3bc9c3c100d4..971fcd68cc8e6 100644 --- a/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp +++ b/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp @@ -19,9 +19,9 @@ #ifndef TEST_HAS_NO_CHAR8_T constexpr bool test_constexpr() { - char8_t c = u'1'; + char8_t c = u8'1'; std::char_traits<char8_t>::assign(c, u'a'); - return c == u'a'; + return c == u8'a'; } int main(int, char**) { >From 
2df2d4844ba1d6a21c8d4677a54858cf085ff886 Mon Sep 17 00:00:00 2001 From: Corentin Jabot <corentinja...@gmail.com> Date: Thu, 8 May 2025 19:07:19 +0200 Subject: [PATCH 4/8] try to fix the libc++ build --- .../std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp | 4 +++- .../std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp index b7266d675c2a1..780d18b364770 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp @@ -19,7 +19,9 @@ // equal(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2); // We test the cartesian product, so we sometimes compare differently signed types -// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -Wno-implicit-unicode-conversion +// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare +// ADDITIONAL_COMPILE_FLAGS(clang-21): -Wno-implicit-unicode-conversion + // MSVC warning C4242: 'argument': conversion from 'int' to 'const _Ty', possible loss of data // MSVC warning C4244: 'argument': conversion from 'wchar_t' to 'const _Ty', possible loss of data // MSVC warning C4389: '==': signed/unsigned mismatch diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp index cfc60369dd69f..1d31a43953d3b 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp @@ -7,7 +7,8 @@ //===----------------------------------------------------------------------===// // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare -// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -Wno-implicit-unicode-conversion +// 
ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare +// ADDITIONAL_COMPILE_FLAGS(clang-21): -Wno-implicit-unicode-conversion // MSVC warning C4245: conversion from 'int' to 'wchar_t', signed/unsigned mismatch // MSVC warning C4305: truncation from 'int' to 'bool' // MSVC warning C4310: cast truncates constant value >From 5e092e78ac6de9df564ee393af7ef6031a95e3ad Mon Sep 17 00:00:00 2001 From: Corentin Jabot <corentinja...@gmail.com> Date: Sat, 10 May 2025 21:51:37 +0200 Subject: [PATCH 5/8] Use -Wcharacter-conversion, add it to -Wconversion, make it a feature for libcxx --- clang/docs/ReleaseNotes.rst | 2 +- clang/include/clang/Basic/DiagnosticGroups.td | 3 ++- clang/include/clang/Basic/DiagnosticSemaKinds.td | 14 +++++++------- .../alg.nonmodifying/alg.equal/equal.pass.cpp | 2 +- .../alg.nonmodifying/alg.find/find.pass.cpp | 2 +- libcxx/utils/libcxx/test/features.py | 4 ++++ 6 files changed, 16 insertions(+), 11 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 3a42f43d79fd1..ab1ae3ddb48e1 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -503,7 +503,7 @@ Improvements to Clang's diagnostics - ``-Wreserved-identifier`` now fires on reserved parameter names in a function declaration which is not a definition. -- A new ``-Wimplicit-unicode-conversion`` warns where comparing or implicitly converting +- A new ``-Wcharacter-conversion`` warns where comparing or implicitly converting between different Unicode character types (``char8_t``, ``char16_t``, ``char32_t``). This warning only triggers in C++ as these types are aliases in C. 
(#GH138526) diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index e5b5dbbd07f10..5bea4f09432b0 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -111,7 +111,7 @@ def EnumConversion : DiagGroup<"enum-conversion", ImplicitEnumEnumCast, EnumFloatConversion, EnumCompareConditional]>; -def ImplicitUnicodeConversion : DiagGroup<"implicit-unicode-conversion">; +def CharacterConversion : DiagGroup<"character-conversion">; def DeprecatedOFast : DiagGroup<"deprecated-ofast">; def ObjCSignedCharBoolImplicitIntConversion : DiagGroup<"objc-signed-char-bool-implicit-int-conversion">; @@ -1074,6 +1074,7 @@ def Parentheses : DiagGroup<"parentheses", // - __null-to-integer conversion warnings are on by default def Conversion : DiagGroup<"conversion", [BoolConversion, + CharacterConversion, ConstantConversion, EnumConversion, BitFieldEnumConversion, diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 9cd5d3d36b928..be2791c3ff1fc 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -4360,25 +4360,25 @@ def warn_address_of_reference_bool_conversion : Warning< def warn_impcast_unicode_char_type : Warning<"implicit conversion from %0 to %1 may change the meaning of the " "represented code unit">, - InGroup<ImplicitUnicodeConversion>; + InGroup<CharacterConversion>; def warn_impcast_unicode_precision : Warning<"implicit conversion from %0 to %1 may lose precision and change " "the meaning of the represented code unit">, - InGroup<ImplicitUnicodeConversion>; + InGroup<CharacterConversion>; def warn_impcast_unicode_char_type_constant : Warning<"implicit conversion from %0 to %1 changes the meaning of the " "%select{code unit|codepoint}2 '%3'">, - InGroup<ImplicitUnicodeConversion>; + InGroup<CharacterConversion>; def 
warn_comparison_unicode_mixed_types : Warning<"comparing values of different Unicode code unit types %0 and %1 " "may compare different codepoints">, - InGroup<ImplicitUnicodeConversion>; + InGroup<CharacterConversion>; def warn_comparison_unicode_mixed_types_constant : Warning<"comparing values of different Unicode code unit types %0 and %1 " "compares unrelated code units '%2' and '%3'">, - InGroup<ImplicitUnicodeConversion>; + InGroup<CharacterConversion>; def warn_xor_used_as_pow : Warning< "result of '%0' is %1; did you mean exponentiation?">, @@ -6843,7 +6843,7 @@ def err_counted_by_on_incomplete_type_on_use : Error < def note_counted_by_consider_completing_pointee_ty : Note< "consider providing a complete definition for %0">; - + def note_counted_by_consider_using_sized_by : Note< "consider using '__sized_by%select{|_or_null}0' instead of " "'__counted_by%select{|_or_null}0'">; @@ -7745,7 +7745,7 @@ def warn_comparison_of_mixed_enum_types_switch : Warning< def warn_arith_conv_mixed__unicode_types : Warning<"%sub{select_arith_conv_kind}0 " "different Unicode character types %1 and %2">, - InGroup<ImplicitUnicodeConversion>; + InGroup<CharacterConversion>; def err_typecheck_assign_const : Error< "%select{" diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp index 780d18b364770..859532d4b79c7 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp @@ -20,7 +20,7 @@ // We test the cartesian product, so we sometimes compare differently signed types // ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -// ADDITIONAL_COMPILE_FLAGS(clang-21): -Wno-implicit-unicode-conversion +// ADDITIONAL_COMPILE_FLAGS(character-conversion-warnings): -Wno-character-conversion // MSVC warning C4242: 'argument': conversion from 'int' to 'const _Ty', possible loss of data 
// MSVC warning C4244: 'argument': conversion from 'wchar_t' to 'const _Ty', possible loss of data diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp index 1d31a43953d3b..989edcb3f6eed 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp @@ -8,7 +8,7 @@ // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare // ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -// ADDITIONAL_COMPILE_FLAGS(clang-21): -Wno-implicit-unicode-conversion +// ADDITIONAL_COMPILE_FLAGS(character-conversion-warnings): -Wno-character-conversion // MSVC warning C4245: conversion from 'int' to 'wchar_t', signed/unsigned mismatch // MSVC warning C4305: truncation from 'int' to 'bool' // MSVC warning C4310: cast truncates constant value diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 10fc4b0afde6b..74746e37d3bc4 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -144,6 +144,10 @@ def _mingwSupportsModules(cfg): when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"), actions=[AddCompileFlag("-Wuser-defined-warnings")], ), + Feature( + name="character-conversion-warnings", + when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"), + ), # Tests to validate whether the compiler has a way to set the maximum number # of steps during constant evaluation. Since the flag differs per compiler # store the "valid" flag as a feature. 
This allows passing the proper compile >From 8e39dc9c6e82b8418e461b9b99553064dcf09074 Mon Sep 17 00:00:00 2001 From: Corentin Jabot <corentinja...@gmail.com> Date: Sat, 10 May 2025 22:53:44 +0200 Subject: [PATCH 6/8] Don't warn if one side of the comparison can be evaluated to a code point representable in both types --- .../clang/Basic/DiagnosticSemaKinds.td | 6 +- clang/lib/AST/ASTDiagnostic.cpp | 3 +- clang/lib/Sema/SemaChecking.cpp | 1 + clang/lib/Sema/SemaExpr.cpp | 74 +++++++++++-------- .../warn-implicit-unicode-conversions.cpp | 44 +++++------ 5 files changed, 68 insertions(+), 60 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index be2791c3ff1fc..686dce9077735 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -4367,12 +4367,12 @@ def warn_impcast_unicode_precision InGroup<CharacterConversion>; def warn_impcast_unicode_char_type_constant : Warning<"implicit conversion from %0 to %1 changes the meaning of the " - "%select{code unit|codepoint}2 '%3'">, + "%select{code unit|code point}2 '%3'">, InGroup<CharacterConversion>; def warn_comparison_unicode_mixed_types : Warning<"comparing values of different Unicode code unit types %0 and %1 " - "may compare different codepoints">, + "may compare different code points">, InGroup<CharacterConversion>; def warn_comparison_unicode_mixed_types_constant @@ -7742,7 +7742,7 @@ def warn_comparison_of_mixed_enum_types_switch : Warning< "%diff{ ($ and $)|}0,1">, InGroup<EnumCompareSwitch>; -def warn_arith_conv_mixed__unicode_types +def warn_arith_conv_mixed_unicode_types : Warning<"%sub{select_arith_conv_kind}0 " "different Unicode character types %1 and %2">, InGroup<CharacterConversion>; diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp index 0c9f50fb1a01c..a00d5801f054b 100644 --- a/clang/lib/AST/ASTDiagnostic.cpp +++ 
b/clang/lib/AST/ASTDiagnostic.cpp @@ -2203,9 +2203,10 @@ std::string clang::FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T) { assert(Value <= 0xFFFF && "not a valid UTF-16 code unit"); return llvm::IsSingleCodeUnitUTF16Codepoint(Value); } + assert(T->isChar32Type()); return llvm::IsSingleCodeUnitUTF32Codepoint(Value); }; - llvm::SmallVector<char, 4> Str; + llvm::SmallVector<char, 16> Str; if (!IsSingleCodeUnitCP(Value, T)) { llvm::raw_svector_ostream OS(Str); OS << "<" << llvm::format_hex(Value, 1, /*Upper=*/true) << ">"; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 9361683ff4a8c..13fa2b8ef5143 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -11834,6 +11834,7 @@ static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source, return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue()); if (T->isChar16Type()) return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue()); + assert(T->isChar32Type()); return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue()); }; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index a7a7f55f3d34f..e42a85a04f5bf 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -1582,50 +1582,60 @@ static void CheckUnicodeArithmeticConversions(Sema &SemaRef, Expr *LHS, if (SemaRef.getASTContext().hasSameType(LHSType, RHSType)) return; - Expr::EvalResult LHSRes, RHSRes; - bool Success = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(), - Expr::SE_AllowSideEffects, - SemaRef.isConstantEvaluatedContext()); - if (Success) - Success = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(), - Expr::SE_AllowSideEffects, - SemaRef.isConstantEvaluatedContext()); - if (Success) { - llvm::APSInt LHSValue(32); - LHSValue = LHSRes.Val.getInt(); - llvm::APSInt RHSValue(32); - RHSValue = RHSRes.Val.getInt(); - - auto IsSingleCodeUnitCP = [](const QualType &T, - const llvm::APSInt &Value) { - if 
(T->isChar8Type()) - return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue()); - if (T->isChar16Type()) - return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue()); - return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue()); - }; + auto IsSingleCodeUnitCP = [](const QualType &T, const llvm::APSInt &Value) { + if (T->isChar8Type()) + return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue()); + if (T->isChar16Type()) + return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue()); + assert(T->isChar32Type()); + return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue()); + }; - bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue); - bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue); - if (LHSSafe && RHSSafe) + Expr::EvalResult LHSRes, RHSRes; + bool LHSSuccess = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(), + Expr::SE_AllowSideEffects, + SemaRef.isConstantEvaluatedContext()); + bool RHSuccess = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(), + Expr::SE_AllowSideEffects, + SemaRef.isConstantEvaluatedContext()); + + // Don't warn if the one known value is a representable + // in the type of both expressions. + if (LHSSuccess != RHSuccess) { + Expr::EvalResult &Res = LHSSuccess ? 
LHSRes : RHSRes; + if (IsSingleCodeUnitCP(LHSType, Res.Val.getInt()) && + IsSingleCodeUnitCP(RHSType, Res.Val.getInt())) return; + } - SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant) + if (!LHSSuccess || !RHSuccess) { + SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types) << LHS->getSourceRange() << RHS->getSourceRange() << LHSType - << RHSType - << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType) - << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType); + << RHSType; return; } - SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types) - << LHS->getSourceRange() << RHS->getSourceRange() << LHSType << RHSType; + + llvm::APSInt LHSValue(32); + LHSValue = LHSRes.Val.getInt(); + llvm::APSInt RHSValue(32); + RHSValue = RHSRes.Val.getInt(); + + bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue); + bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue); + if (LHSSafe && RHSSafe) + return; + + SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant) + << LHS->getSourceRange() << RHS->getSourceRange() << LHSType << RHSType + << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType) + << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType); return; } if (SemaRef.getASTContext().hasSameType(LHSType, RHSType)) return; - SemaRef.Diag(Loc, diag::warn_arith_conv_mixed__unicode_types) + SemaRef.Diag(Loc, diag::warn_arith_conv_mixed_unicode_types) << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType << RHSType; return; diff --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp index 41794b15175b5..fcff006d0e028 100644 --- a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp +++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp @@ -19,12 +19,12 @@ void test(char8_t u8, char16_t u16, char32_t u32) { c8(char32_t(0x7f)); - c8(char32_t(0x80)); // expected-warning {{implicit conversion from 'char32_t' to 
'char8_t' changes the meaning of the codepoint '<U+0080>'}} + c8(char32_t(0x80)); // expected-warning {{implicit conversion from 'char32_t' to 'char8_t' changes the meaning of the code point '<U+0080>'}} c8(char16_t(0x7f)); - c8(char16_t(0x80)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the codepoint '<U+0080>'}} + c8(char16_t(0x80)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code point '<U+0080>'}} c8(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code unit '<0xD800>'}} - c8(char16_t(0xE000)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the codepoint '<U+E000>'}} + c8(char16_t(0xE000)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code point '<U+E000>'}} c16(char32_t(0x7f)); @@ -32,7 +32,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) { c16(char32_t(0xD7FF)); c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}} c16(char32_t(0xE000)); - c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the codepoint '🐉'}} + c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code point '🐉'}} c32(char8_t(0x7f)); @@ -49,7 +49,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) { c32(char16_t(0xE000)); c32(char16_t(u'☕')); - (void)static_cast<char32_t>(char8_t(0x80)); // sanity check: no explicit conversion; + (void)static_cast<char32_t>(char8_t(0x80)); //no warnings for explicit conversions. 
using Char8 = char8_t; Char8 c81 = u16; // expected-warning {{implicit conversion from 'char16_t' to 'Char8' (aka 'char8_t') may lose precision and change the meaning of the represented code unit}} @@ -63,31 +63,27 @@ void test(char8_t u8, char16_t u16, char32_t u32) { void test_comp(char8_t u8, char16_t u16, char32_t u32) { (void)(u8 == u8' '); - (void)(u8 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' may compare different codepoints}} - (void)(u8 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different codepoints}} + (void)(u8 == u' '); + (void)(u8 == U' '); - (void)(u16 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different codepoints}} - (void)(u16 == u' '); - (void)(u16 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different codepoints}} + (void)(u16 == u8' '); + (void)(u16 == U' '); - (void)(u32 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different codepoints}} - (void)(u32 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different codepoints}} + (void)(u32 == u8' '); + (void)(u32 == u' '); (void)(u32 == U' '); + (void)(u8 == u'\u00FF'); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' may compare different code points}} + (void)(u8 == U'\u00FF'); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different code points}} - (void)(u8' ' == u' '); - (void)(u8' ' == u' '); - - - (void)(u8 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare 
different codepoints}} - (void)(u16 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different codepoints}} - (void)(u16 == u' '); - (void)(u16 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different codepoints}} - - (void)(u32 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different codepoints}} - (void)(u32 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different codepoints}} - (void)(u32 == U' '); + (void)(u16 == u8'\xFF'); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different code points}} + (void)(u16 == u'\u00FF'); + (void)(u16 == U'\u00FF'); + (void)(u16 == U'\xD800'); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different code points}} + (void)(u32 == u8'\xFF'); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different code points}} + (void)(u32 == u'\u00FF'); + (void)(u32 == u'\xD800'); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different code points}} (void)(char8_t(0x7f) == char8_t(0x7f)); (void)(char8_t(0x7f) == char16_t(0x7f)); >From c299893015da0e0727139d128c3ef4ac4e686929 Mon Sep 17 00:00:00 2001 From: Corentin Jabot <corentinja...@gmail.com> Date: Sat, 10 May 2025 23:15:45 +0200 Subject: [PATCH 7/8] libc++ 03 fixes --- libcxx/test/std/localization/codecvt_unicode.pass.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp index 7c5f112c7f495..fed183ee0e71f 
100644 --- a/libcxx/test/std/localization/codecvt_unicode.pass.cpp +++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp @@ -484,7 +484,7 @@ template <class InternT, class ExternT> void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; - const InternT expected[] = {u'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {0x61, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); @@ -549,7 +549,7 @@ template <class InternT, class ExternT> void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; - const InternT expected[] = {u'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {0x61, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); @@ -618,7 +618,7 @@ template <class InternT, class ExternT> void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP const unsigned char input[] = "b\u0448\uD700\U0010AAAA"; - const InternT expected[] = {u'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {0x61, 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); >From 9a17333fb5965f226be4f6d3783513da74571671 Mon Sep 17 00:00:00 2001 From: Corentin Jabot <corentinja...@gmail.com> Date: Sun, 11 May 2025 10:21:59 +0200 Subject: [PATCH 8/8] I guess I don't know my ascii tables... 
--- libcxx/test/std/localization/codecvt_unicode.pass.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp index fed183ee0e71f..da1acc8061fe1 100644 --- a/libcxx/test/std/localization/codecvt_unicode.pass.cpp +++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp @@ -484,7 +484,7 @@ template <class InternT, class ExternT> void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; - const InternT expected[] = {0x61, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {0x62, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); @@ -549,7 +549,7 @@ template <class InternT, class ExternT> void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; - const InternT expected[] = {0x61, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {0x62, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); @@ -618,7 +618,7 @@ template <class InternT, class ExternT> void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP const unsigned char input[] = "b\u0448\uD700\U0010AAAA"; - const InternT expected[] = {0x61, 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {0x62, 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org 
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits