abhina.sreeskantharajan created this revision.
Herald added a project: All.
abhina.sreeskantharajan requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.
This patch adds iconv support to the CharSetConverter class.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D153418
Files:
clang/include/clang/Basic/CharSet.h
clang/include/clang/Config/config.h.cmake
clang/lib/Basic/CMakeLists.txt
clang/lib/Basic/CharSet.cpp
clang/unittests/Basic/CharSetTest.cpp
Index: clang/unittests/Basic/CharSetTest.cpp
===================================================================
--- clang/unittests/Basic/CharSetTest.cpp
+++ clang/unittests/Basic/CharSetTest.cpp
@@ -40,6 +40,29 @@
// String with Cyrillic character ya.
static const char CyrillicUTF[] = "\xd0\xaf";
+// String "Earthå°ç".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+// Identical to above, except the final character (ç) has its last byte taken
+// away from it.
+static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90";
+static const char EarthISO2022[] =
+ "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthISO2022ShiftBack[] =
+ "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65";
+static const char EarthIBM939[] =
+ "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+static const char ShiftBackOnly[] = "\x1B\x28\x42";
+
+// String "å°ç".
+static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthKanjiOnlyISO2022[] =
+ "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42";
+static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f";
+
TEST(CharSet, FromUTF8) {
// Hello string.
StringRef Src(HelloA);
@@ -98,4 +121,154 @@
EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
}
+// Sends the byte values 1..255 through IBM-1047 -> UTF-16 -> UTF-32 ->
+// IBM-1047 and expects the final output to equal the original input.
+TEST(CharSet, RoundTrip) {
+  const std::error_code Unsupported =
+      std::make_error_code(std::errc::invalid_argument);
+
+  // Each converter is created in turn; the test stops early if a conversion is
+  // not supported (no underlying iconv support).
+  ErrorOr<CharSetConverter> ToUTF16 =
+      CharSetConverter::create("IBM-1047", "UTF-16");
+  if (!ToUTF16) {
+    ASSERT_EQ(ToUTF16.getError(), Unsupported);
+    return;
+  }
+  ErrorOr<CharSetConverter> ToUTF32 =
+      CharSetConverter::create("UTF-16", "UTF-32");
+  if (!ToUTF32) {
+    ASSERT_EQ(ToUTF32.getError(), Unsupported);
+    return;
+  }
+  ErrorOr<CharSetConverter> ToEBCDIC =
+      CharSetConverter::create("UTF-32", "IBM-1047");
+  if (!ToEBCDIC) {
+    ASSERT_EQ(ToEBCDIC.getError(), Unsupported);
+    return;
+  }
+
+  // Source bytes are 1, 2, ..., 255 followed by the terminating 0.
+  char Original[256];
+  for (size_t Pos = 0; Pos != sizeof(Original); ++Pos)
+    Original[Pos] = (Pos + 1) % 256;
+
+  SmallString<99> AsUTF16, AsUTF32, BackToEBCDIC;
+
+  std::error_code Status =
+      ToUTF16->convert(StringRef(Original), AsUTF16, true);
+  EXPECT_TRUE(!Status);
+  Status = ToUTF32->convert(AsUTF16, AsUTF32, true);
+  EXPECT_TRUE(!Status);
+  Status = ToEBCDIC->convert(AsUTF32, BackToEBCDIC, true);
+  EXPECT_TRUE(!Status);
+  EXPECT_STREQ(Original, static_cast<std::string>(BackToEBCDIC).c_str());
+}
+
+// Converts the UTF-8 "Earth" string to ISO-2022-JP with auto-flush enabled and
+// compares the output against EarthISO2022.
+TEST(CharSet, ShiftState2022) {
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Nothing to test if the conversion is not supported (no underlying iconv
+  // support).
+  if (!Converter) {
+    ASSERT_EQ(Converter.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  StringRef Input(EarthUTF);
+  SmallString<64> Output;
+
+  // The conversion must succeed and produce the expected byte sequence.
+  std::error_code Status = Converter->convert(Input, Output, true);
+  EXPECT_TRUE(!Status);
+  EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Output).c_str());
+}
+
+// Checks that a UTF-8 to ISO-2022-JP converter can be reused after a failed
+// conversion once flush() has been called.
+TEST(CharSet, ShiftState2022Flush) {
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!Converter) {
+    ASSERT_EQ(Converter.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  StringRef BrokenInput(EarthUTFBroken);
+  StringRef KanjiInput(EarthKanjiOnlyUTF);
+  SmallString<64> BrokenOutput;
+  SmallString<64> KanjiOutput;
+
+  // The first input ends in a truncated multibyte character, so converting it
+  // has to report an error.
+  std::error_code BrokenStatus =
+      Converter->convert(BrokenInput, BrokenOutput, true);
+  EXPECT_TRUE(BrokenStatus);
+  // After flushing, a well-formed input converts as usual.
+  std::error_code FlushStatus = Converter->flush();
+  EXPECT_TRUE(!FlushStatus);
+  std::error_code KanjiStatus =
+      Converter->convert(KanjiInput, KanjiOutput, true);
+  EXPECT_TRUE(!KanjiStatus);
+  EXPECT_STREQ(EarthKanjiOnlyISO2022,
+               static_cast<std::string>(KanjiOutput).c_str());
+}
+
+// Converts the UTF-8 "Earth" string to IBM-939 with auto-flush enabled and
+// compares the output against EarthIBM939.
+TEST(CharSet, ShiftStateIBM939) {
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  // Nothing to test if the conversion is not supported (no underlying iconv
+  // support).
+  if (!Converter) {
+    ASSERT_EQ(Converter.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  StringRef Input(EarthUTF);
+  SmallString<64> Output;
+
+  // The conversion must succeed and produce the expected byte sequence.
+  std::error_code Status = Converter->convert(Input, Output, true);
+  EXPECT_TRUE(!Status);
+  EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Output).c_str());
+}
+
+// Checks that a UTF-8 to IBM-939 converter can be reused after a failed
+// conversion once flush() has been called.
+TEST(CharSet, ShiftStateIBM939Flush) {
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  if (!Converter) {
+    ASSERT_EQ(Converter.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  StringRef BrokenInput(EarthUTFBroken);
+  StringRef KanjiInput(EarthKanjiOnlyUTF);
+  SmallString<64> BrokenOutput;
+  SmallString<64> KanjiOutput;
+
+  // The first input ends in a truncated multibyte character, so converting it
+  // has to report an error.
+  std::error_code BrokenStatus =
+      Converter->convert(BrokenInput, BrokenOutput, true);
+  EXPECT_TRUE(BrokenStatus);
+  // After flushing, a well-formed input converts as usual.
+  std::error_code FlushStatus = Converter->flush();
+  EXPECT_TRUE(!FlushStatus);
+  std::error_code KanjiStatus =
+      Converter->convert(KanjiInput, KanjiOutput, true);
+  EXPECT_TRUE(!KanjiStatus);
+  EXPECT_STREQ(EarthKanjiOnlyIBM939,
+               static_cast<std::string>(KanjiOutput).c_str());
+}
+
+// Converts to ISO-2022-JP without auto-flush, so the output stops before the
+// switch back to ASCII, then expects flush(Result) to produce exactly that
+// switch-back sequence.
+TEST(CharSet, ShiftState2022Flush1) {
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!Converter) {
+    ASSERT_EQ(Converter.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  StringRef Input(EarthUTF);
+  SmallString<64> Converted;
+  SmallString<64> Flushed;
+
+  std::error_code ConvertStatus = Converter->convert(Input, Converted, false);
+  EXPECT_TRUE(!ConvertStatus);
+  EXPECT_STREQ(EarthISO2022ShiftBack,
+               static_cast<std::string>(Converted).c_str());
+  std::error_code FlushStatus = Converter->flush(Flushed);
+  EXPECT_TRUE(!FlushStatus);
+  EXPECT_STREQ(ShiftBackOnly, static_cast<std::string>(Flushed).c_str());
+}
+
} // namespace
Index: clang/lib/Basic/CharSet.cpp
===================================================================
--- clang/lib/Basic/CharSet.cpp
+++ clang/lib/Basic/CharSet.cpp
@@ -22,6 +22,10 @@
#include <limits>
#include <system_error>
+#ifdef HAVE_ICONV
+#include <iconv.h>
+#endif
+
using namespace llvm;
// Normalize the charset name with the charset alias matching algorithm proposed
@@ -97,6 +101,132 @@
return std::error_code();
}
+#ifdef HAVE_ICONV
+/// Converter implementation backed by the system iconv library.
+///
+/// Takes ownership of the conversion descriptor passed to the constructor and
+/// releases it with iconv_close() on destruction.
+class CharSetConverterIconv : public details::CharSetConverterImplBase {
+  /// Conversion descriptor obtained from iconv_open().
+  iconv_t ConvDesc;
+
+public:
+  explicit CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
+
+  // The descriptor has exactly one owner; a copy would close it twice.
+  CharSetConverterIconv(const CharSetConverterIconv &) = delete;
+  CharSetConverterIconv &operator=(const CharSetConverterIconv &) = delete;
+
+  // Release the descriptor so that it is not leaked.
+  // NOTE(review): when the object is deleted through a
+  // details::CharSetConverterImplBase pointer this only runs if the base class
+  // has a virtual destructor - confirm in CharSet.h.
+  ~CharSetConverterIconv() {
+    // (iconv_t)-1 is the "invalid descriptor" value returned by iconv_open().
+    if (ConvDesc != (iconv_t)-1)
+      iconv_close(ConvDesc);
+  }
+
+  /// Converts \p Source and stores the converted bytes in \p Result.
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  /// Resets the conversion state without producing output.
+  std::error_code flush() const override;
+  /// Resets the conversion state and stores any bytes iconv emits while doing
+  /// so in \p Result.
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+/// Converts \p Source with iconv and stores the converted bytes in \p Result.
+///
+/// Previous contents of \p Result are overwritten. The output buffer starts
+/// out as the current capacity of \p Result and is enlarged whenever iconv
+/// reports E2BIG. If \p Source is empty, no conversion is performed and the
+/// iconv conversion state is reset instead. If \p ShouldAutoFlush is set,
+/// iconv is called once more without input after the conversion so that it
+/// can emit the bytes needed to return to the initial state.
+///
+/// \returns a default-constructed error code on success; otherwise the errno
+/// value reported by iconv, or illegal_byte_sequence if iconv returned a
+/// positive count of nonreversible conversions. In both cases \p Result is
+/// trimmed to the bytes written so far.
+std::error_code CharSetConverterIconv::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  // Setup the input. Use nullptr to reset iconv state if input length is zero.
+  size_t InputLength = Source.size();
+  char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+  size_t OutputLength = Capacity;
+
+  // Trim Result to the bytes iconv has actually written and pass EC through.
+  // Used on every exit so that callers never see uninitialized bytes, also
+  // when the conversion fails.
+  auto Finish = [&Capacity, &OutputLength, &Result](std::error_code EC) {
+    Result.resize(Capacity - OutputLength);
+    return EC;
+  };
+
+  // Handle a non-zero return value from iconv(). Returns a default-constructed
+  // error code if the output buffer was enlarged and the call can be retried.
+  auto HandleError = [&Capacity, &Output, &OutputLength,
+                      &Result](size_t Ret) -> std::error_code {
+    if (Ret != static_cast<size_t>(-1)) {
+      // A positive return value indicates that some characters were converted
+      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+      // an error in this case makes sure that both conversion routines behave
+      // in the same way.
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    }
+
+    // An error occurred. Read errno once, before any other call can change it.
+    const int Err = errno;
+    const size_t MaxCapacity = std::numeric_limits<size_t>::max();
+    if (Err != E2BIG || Capacity == MaxCapacity) {
+      // Some other error occurred, or the buffer cannot grow any further.
+      return std::error_code(Err, std::generic_category());
+    }
+
+    // No space left in output buffer. Double the size of the underlying
+    // memory in the SmallVectorImpl, adjust pointer and length and continue
+    // the conversion. A vector without any capacity gets a small non-zero
+    // size, because doubling zero would never make room and the caller would
+    // retry forever.
+    const size_t MinCapacity = 16;
+    const size_t Used = Capacity - OutputLength;
+    if (Capacity == 0)
+      Capacity = MinCapacity;
+    else if (Capacity < MaxCapacity / 2)
+      Capacity = 2 * Capacity;
+    else
+      Capacity = MaxCapacity;
+    Result.resize_for_overwrite(Capacity);
+    Output = static_cast<char *>(Result.data()) + Used;
+    OutputLength = Capacity - Used;
+    return std::error_code();
+  };
+
+  size_t Ret;
+
+  // Convert the string.
+  while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
+    if (std::error_code EC = HandleError(Ret))
+      return Finish(EC);
+  if (ShouldAutoFlush) {
+    while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+      if (std::error_code EC = HandleError(Ret))
+        return Finish(EC);
+  }
+
+  // Re-adjust size to actual size.
+  return Finish(std::error_code());
+}
+
+/// Calls iconv without input and without an output buffer, which resets the
+/// conversion state and produces no output.
+///
+/// \returns the errno value reported by iconv on failure, otherwise a
+/// default-constructed error code.
+std::error_code CharSetConverterIconv::flush() const {
+  const size_t Status = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+  const bool Failed = Status == static_cast<size_t>(-1);
+  return Failed ? std::error_code(errno, std::generic_category())
+                : std::error_code();
+}
+
+/// Resets the iconv conversion state and stores the bytes iconv emits while
+/// doing so (for stateful encodings, the sequence that switches back to the
+/// initial state) in \p Result.
+///
+/// Previous contents of \p Result are overwritten. The output buffer starts
+/// out as the current capacity of \p Result and grows in steps of two bytes
+/// whenever iconv reports E2BIG.
+///
+/// \returns a default-constructed error code on success; otherwise the errno
+/// value reported by iconv, or illegal_byte_sequence if iconv returned a
+/// positive value. In both cases \p Result is trimmed to the bytes written so
+/// far.
+std::error_code
+CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
+  // Expose the whole capacity as output buffer, then take its address.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = Result.data();
+  size_t OutputLength = Capacity;
+
+  // Trim Result to the bytes iconv has actually written and pass EC through.
+  // Used on every exit so that callers never see uninitialized bytes, also
+  // when flushing fails.
+  auto Finish = [&Capacity, &OutputLength, &Result](std::error_code EC) {
+    Result.resize(Capacity - OutputLength);
+    return EC;
+  };
+
+  // Handle a non-zero return value from iconv(). Returns a default-constructed
+  // error code if the output buffer was enlarged and the call can be retried.
+  auto HandleError = [&Capacity, &Output, &OutputLength,
+                      &Result](size_t Ret) -> std::error_code {
+    if (Ret != static_cast<size_t>(-1)) {
+      // A positive return value indicates that some characters were converted
+      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+      // an error in this case makes sure that both conversion routines behave
+      // in the same way.
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    }
+
+    // An error occurred. Read errno once, before any other call can change it.
+    const int Err = errno;
+    const size_t MaxCapacity = std::numeric_limits<size_t>::max();
+    if (Err != E2BIG || Capacity == MaxCapacity) {
+      // Some other error occurred, or the buffer cannot grow any further.
+      return std::error_code(Err, std::generic_category());
+    }
+
+    // No space left in output buffer. Increase the size of the underlying
+    // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
+    // and continue the conversion.
+    const size_t Used = Capacity - OutputLength;
+    Capacity = (Capacity < MaxCapacity - 2) ? 2 + Capacity : MaxCapacity;
+    Result.resize_for_overwrite(Capacity);
+    Output = static_cast<char *>(Result.data()) + Used;
+    OutputLength = Capacity - Used;
+    return std::error_code();
+  };
+
+  size_t Ret;
+  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+    if (std::error_code EC = HandleError(Ret))
+      return Finish(EC);
+
+  // Re-adjust size to actual size.
+  return Finish(std::error_code());
+}
+
+#endif // HAVE_ICONV
} // namespace
CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
@@ -120,5 +250,13 @@
std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
if (From && To)
return create(*From, *To);
+#if HAVE_ICONV
+ iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+ if (ConvDesc == (iconv_t)-1)
+ return std::error_code(errno, std::generic_category());
+ std::unique_ptr<details::CharSetConverterImplBase> Converter =
+ std::make_unique<CharSetConverterIconv>(ConvDesc);
+ return CharSetConverter(std::move(Converter));
+#endif
return std::make_error_code(std::errc::invalid_argument);
}
Index: clang/lib/Basic/CMakeLists.txt
===================================================================
--- clang/lib/Basic/CMakeLists.txt
+++ clang/lib/Basic/CMakeLists.txt
@@ -51,6 +51,17 @@
PROPERTIES COMPILE_DEFINITIONS "CLANG_VENDOR=\"${CLANG_VENDOR} \"")
endif()
+# Detect iconv and record the result in HAVE_ICONV. When iconv is found but is
+# not built into the C library, add its libraries to system_libs.
+find_package(Iconv)
+if(Iconv_FOUND)
+  set(HAVE_ICONV 1)
+  if(NOT Iconv_IS_BUILT_IN)
+    set(system_libs ${system_libs} ${Iconv_LIBRARIES})
+  endif()
+else()
+  set(HAVE_ICONV 0)
+endif()
+
add_clang_library(clangBasic
Attributes.cpp
Builtins.cpp
Index: clang/include/clang/Config/config.h.cmake
===================================================================
--- clang/include/clang/Config/config.h.cmake
+++ clang/include/clang/Config/config.h.cmake
@@ -57,6 +57,9 @@
/* Define if we have sys/resource.h (rlimits) */
#cmakedefine CLANG_HAVE_RLIMITS ${CLANG_HAVE_RLIMITS}
+/* Define if iconv library is available */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
/* Linker version detected at compile time. */
#cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
Index: clang/include/clang/Basic/CharSet.h
===================================================================
--- clang/include/clang/Basic/CharSet.h
+++ clang/include/clang/Basic/CharSet.h
@@ -51,7 +51,10 @@
/// In case of an error, the result string contains the successfully converted
/// part of the input string.
///
-
+ /// If the Source parameter has a zero length, then no conversion is
+ /// performed. Instead, the internal conversion state of iconv is reset to
+ /// the initial state if iconv is used for the conversion. Otherwise it is a
+ /// no-op.
virtual std::error_code convert(StringRef Source,
SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const = 0;
@@ -81,6 +84,8 @@
/// Utility class to convert between different character set encodings.
/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
+/// If the iconv library is available, then arbitrary conversions are supported.
+/// TODO Add Windows support.
class CharSetConverter {
// details::CharSetConverterImplBase *Converter;
std::unique_ptr<details::CharSetConverterImplBase> Converter;
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits