[PATCH] D153418: Adding iconv support to CharSetConverter class

Abhina Sree via Phabricator via cfe-commits Wed, 21 Jun 2023 06:20:03 -0700

abhina.sreeskantharajan created this revision.
Herald added a project: All.
abhina.sreeskantharajan requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.


This patch adds iconv support to the CharSetConverter class.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D153418

Files:
  clang/include/clang/Basic/CharSet.h
  clang/include/clang/Config/config.h.cmake
  clang/lib/Basic/CMakeLists.txt
  clang/lib/Basic/CharSet.cpp
  clang/unittests/Basic/CharSetTest.cpp

Index: clang/unittests/Basic/CharSetTest.cpp
===================================================================
--- clang/unittests/Basic/CharSetTest.cpp
+++ clang/unittests/Basic/CharSetTest.cpp
@@ -40,6 +40,29 @@
 // String with Cyrillic character ya.
 static const char CyrillicUTF[] = "\xd0\xaf";
 
+// String "Earthå°ç".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+// Identical to above, except the final character (ç) has its last byte taken
+// away from it.
+static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90";
+static const char EarthISO2022[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthISO2022ShiftBack[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65";
+static const char EarthIBM939[] =
+    "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+static const char ShiftBackOnly[] = "\x1B\x28\x42";
+
+// String "å°ç".
+static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthKanjiOnlyISO2022[] =
+    "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42";
+static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f";
+
 TEST(CharSet, FromUTF8) {
   // Hello string.
   StringRef Src(HelloA);
@@ -98,4 +121,154 @@
   EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
 }
 
+TEST(CharSet, RoundTrip) {
+  // Error expected from create() when the conversion is not supported (no
+  // underlying iconv support); the test then ends early.
+  const std::error_code Unsupported =
+      std::make_error_code(std::errc::invalid_argument);
+  // EBCDIC 1047 -> UTF-16.
+  ErrorOr<CharSetConverter> ToUTF16 =
+      CharSetConverter::create("IBM-1047", "UTF-16");
+  if (!ToUTF16) {
+    ASSERT_EQ(ToUTF16.getError(), Unsupported);
+    return;
+  }
+  // UTF-16 -> UTF-32.
+  ErrorOr<CharSetConverter> ToUTF32 =
+      CharSetConverter::create("UTF-16", "UTF-32");
+  if (!ToUTF32) {
+    ASSERT_EQ(ToUTF32.getError(), Unsupported);
+    return;
+  }
+  // UTF-32 -> EBCDIC 1047.
+  ErrorOr<CharSetConverter> ToEBCDIC =
+      CharSetConverter::create("UTF-32", "IBM-1047");
+  if (!ToEBCDIC) {
+    ASSERT_EQ(ToEBCDIC.getError(), Unsupported);
+    return;
+  }
+
+  // Source holds the byte values 1..255 followed by a terminating 0.
+  char SrcStr[256];
+  for (size_t Idx = 0; Idx != 256; ++Idx)
+    SrcStr[Idx] = (Idx + 1) % 256;
+
+  SmallString<99> AsUTF16, AsUTF32, AsEBCDIC;
+  std::error_code EC = ToUTF16->convert(StringRef(SrcStr), AsUTF16, true);
+  EXPECT_TRUE(!EC);
+  EC = ToUTF32->convert(AsUTF16, AsUTF32, true);
+  EXPECT_TRUE(!EC);
+  EC = ToEBCDIC->convert(AsUTF32, AsEBCDIC, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(SrcStr, static_cast<std::string>(AsEBCDIC).c_str());
+}
+
+TEST(CharSet, ShiftState2022) {
+  // UTF-8 -> ISO-2022-JP. If create() fails the conversion is not supported
+  // (no underlying iconv support): check the error code and stop the test.
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!Converter) {
+    const std::error_code Unsupported =
+        std::make_error_code(std::errc::invalid_argument);
+    ASSERT_EQ(Converter.getError(), Unsupported);
+    return;
+  }
+  // Convert the Earth string with auto-flush and compare it with the
+  // expected ISO-2022-JP bytes, including the final switch back to ASCII.
+  const StringRef Input(EarthUTF);
+  SmallString<64> Output;
+  const std::error_code EC = Converter->convert(Input, Output, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Output).c_str());
+}
+
+TEST(CharSet, ShiftState2022Flush) {
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop test if conversion is not supported.
+  if (!Converter) {
+    const std::error_code Unsupported =
+        std::make_error_code(std::errc::invalid_argument);
+    ASSERT_EQ(Converter.getError(), Unsupported);
+    return;
+  }
+
+  // The first input ends in a truncated multibyte character, so converting it
+  // must report an error.
+  SmallString<64> BrokenOut;
+  EXPECT_TRUE(Converter->convert(StringRef(EarthUTFBroken), BrokenOut, true));
+  // flush() must succeed, and well-formed input must convert afterwards.
+  EXPECT_TRUE(!Converter->flush());
+  SmallString<64> KanjiOut;
+  const StringRef Kanji(EarthKanjiOnlyUTF);
+  EXPECT_TRUE(!Converter->convert(Kanji, KanjiOut, true));
+  EXPECT_STREQ(EarthKanjiOnlyISO2022,
+               static_cast<std::string>(KanjiOut).c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939) {
+  // UTF-8 -> IBM-939. If create() fails the conversion is not supported (no
+  // underlying iconv support): check the error code and stop the test.
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  if (!Converter) {
+    const std::error_code Unsupported =
+        std::make_error_code(std::errc::invalid_argument);
+    ASSERT_EQ(Converter.getError(), Unsupported);
+    return;
+  }
+  // Convert the Earth string with auto-flush and compare it with the
+  // expected IBM-939 bytes, including the final shift back (0x0F).
+  const StringRef Input(EarthUTF);
+  SmallString<64> Output;
+  const std::error_code EC = Converter->convert(Input, Output, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Output).c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939Flush) {
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  // Stop test if conversion is not supported.
+  if (!Converter) {
+    const std::error_code Unsupported =
+        std::make_error_code(std::errc::invalid_argument);
+    ASSERT_EQ(Converter.getError(), Unsupported);
+    return;
+  }
+
+  // The first input ends in a truncated multibyte character, so converting it
+  // must report an error.
+  SmallString<64> BrokenOut;
+  EXPECT_TRUE(Converter->convert(StringRef(EarthUTFBroken), BrokenOut, true));
+  // flush() must succeed, and well-formed input must convert afterwards.
+  EXPECT_TRUE(!Converter->flush());
+  SmallString<64> KanjiOut;
+  const StringRef Kanji(EarthKanjiOnlyUTF);
+  EXPECT_TRUE(!Converter->convert(Kanji, KanjiOut, true));
+  EXPECT_STREQ(EarthKanjiOnlyIBM939,
+               static_cast<std::string>(KanjiOut).c_str());
+}
+
+TEST(CharSet, ShiftState2022Flush1) {
+  ErrorOr<CharSetConverter> Converter =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!Converter) {
+    const std::error_code Unsupported =
+        std::make_error_code(std::errc::invalid_argument);
+    ASSERT_EQ(Converter.getError(), Unsupported);
+    return;
+  }
+  // Without auto-flush the output stops before the switch back to ASCII.
+  SmallString<64> Converted;
+  EXPECT_TRUE(!Converter->convert(StringRef(EarthUTF), Converted, false));
+  EXPECT_STREQ(EarthISO2022ShiftBack,
+               static_cast<std::string>(Converted).c_str());
+  // The explicit flush then emits only that switch sequence.
+  SmallString<64> Flushed;
+  EXPECT_TRUE(!Converter->flush(Flushed));
+  EXPECT_STREQ(ShiftBackOnly, static_cast<std::string>(Flushed).c_str());
+}
+
 } // namespace
Index: clang/lib/Basic/CharSet.cpp
===================================================================
--- clang/lib/Basic/CharSet.cpp
+++ clang/lib/Basic/CharSet.cpp
@@ -22,6 +22,10 @@
 #include <limits>
 #include <system_error>
 
+#ifdef HAVE_ICONV
+#include <iconv.h>
+#endif
+
 using namespace llvm;
 
 // Normalize the charset name with the charset alias matching algorithm proposed
@@ -97,6 +101,132 @@
   return std::error_code();
 }
 
+#ifdef HAVE_ICONV
+/// Converter backed by iconv; owns ConvDesc and closes it on destruction.
+class CharSetConverterIconv : public details::CharSetConverterImplBase {
+  iconv_t ConvDesc;
+public:
+  explicit CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
+  ~CharSetConverterIconv() { iconv_close(ConvDesc); }
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+/// Converts Source with iconv into Result, growing Result as needed. An empty
+/// Source only resets the iconv state. With ShouldAutoFlush the reset sequence
+/// back to the initial state is appended. Result is trimmed to the bytes
+/// actually written on every exit, including errors.
+std::error_code CharSetConverterIconv::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  // Setup the input. Use nullptr to reset iconv state if input length is zero.
+  size_t InputLength = Source.size();
+  char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+  size_t OutputLength = Capacity;
+
+  // Shrink Result to the bytes written so far and pass EC through.
+  auto Finish = [&](std::error_code EC) {
+    Result.resize(Capacity - OutputLength);
+    return EC;
+  };
+
+  // Handle a nonzero return value of iconv().
+  auto HandleError = [&](size_t Ret) {
+    // A positive return value indicates that some characters were converted
+    // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+    // an error in this case makes sure that both conversion routines behave
+    // in the same way.
+    if (Ret != static_cast<size_t>(-1))
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    constexpr size_t MaxSize = std::numeric_limits<size_t>::max();
+    if (errno != E2BIG || Capacity == MaxSize)
+      return std::error_code(errno, std::generic_category());
+    // No space left in output buffer. Double the underlying memory (or start
+    // at 1 byte, because doubling an empty buffer would never make progress),
+    // adjust pointer and length and continue the conversion.
+    const size_t Used = Capacity - OutputLength;
+    const size_t Doubled = Capacity < MaxSize / 2 ? 2 * Capacity : MaxSize;
+    Capacity = Doubled ? Doubled : 1;
+    Result.resize_for_overwrite(Capacity);
+    Output = static_cast<char *>(Result.data()) + Used;
+    OutputLength = Capacity - Used;
+    return std::error_code();
+  };
+
+  // Convert the string, then optionally emit the reset sequence.
+  size_t Ret;
+  while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return Finish(EC);
+  if (ShouldAutoFlush) {
+    while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+      if (auto EC = HandleError(Ret))
+        return Finish(EC);
+  }
+  return Finish(std::error_code());
+}
+
+/// Resets the iconv conversion state; no output is produced.
+std::error_code CharSetConverterIconv::flush() const {
+  const size_t Status = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+  if (Status != static_cast<size_t>(-1))
+    return std::error_code();
+  return std::error_code(errno, std::generic_category());
+}
+
+/// Resets the iconv conversion state and stores the bytes needed to return to
+/// the initial shift state in Result, replacing its previous contents. Result
+/// is trimmed to the bytes actually written on every exit, including errors.
+std::error_code
+CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
+  // Expose the whole capacity first, then write directly into the SmallVector.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = Result.data();
+  size_t OutputLength = Capacity;
+
+  // Shrink Result to the bytes written so far and pass EC through.
+  auto Finish = [&](std::error_code EC) {
+    Result.resize(Capacity - OutputLength);
+    return EC;
+  };
+
+  // Handle a nonzero return value of iconv().
+  auto HandleError = [&](size_t Ret) {
+    // A positive return value indicates that some characters were converted
+    // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+    // an error in this case makes sure that both conversion routines behave
+    // in the same way.
+    if (Ret != static_cast<size_t>(-1))
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    constexpr size_t MaxSize = std::numeric_limits<size_t>::max();
+    if (errno != E2BIG || Capacity == MaxSize)
+      return std::error_code(errno, std::generic_category());
+    // No space left in output buffer. Increase the size of the underlying
+    // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
+    // and continue the conversion.
+    const size_t Used = Capacity - OutputLength;
+    Capacity = Capacity < MaxSize - 2 ? 2 + Capacity : MaxSize;
+    Result.resize_for_overwrite(Capacity);
+    Output = static_cast<char *>(Result.data()) + Used;
+    OutputLength = Capacity - Used;
+    return std::error_code();
+  };
+
+  size_t Ret;
+  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return Finish(EC);
+  return Finish(std::error_code());
+}
+
+#endif // HAVE_ICONV
 } // namespace
 
 CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
@@ -120,5 +250,13 @@
   std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
   if (From && To)
     return create(*From, *To);
+#if HAVE_ICONV
+  iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::error_code(errno, std::generic_category());
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterIconv>(ConvDesc);
+  return CharSetConverter(std::move(Converter));
+#endif
   return std::make_error_code(std::errc::invalid_argument);
 }
Index: clang/lib/Basic/CMakeLists.txt
===================================================================
--- clang/lib/Basic/CMakeLists.txt
+++ clang/lib/Basic/CMakeLists.txt
@@ -51,6 +51,17 @@
     PROPERTIES COMPILE_DEFINITIONS "CLANG_VENDOR=\"${CLANG_VENDOR} \"")
 endif()
 
+# Detect iconv, record it in HAVE_ICONV, and add external iconv to system_libs.
+find_package(Iconv)
+if(Iconv_FOUND)
+  set(HAVE_ICONV 1)
+else()
+  set(HAVE_ICONV 0)
+endif()
+if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
+  set(system_libs ${system_libs} ${Iconv_LIBRARIES})
+endif()
+
 add_clang_library(clangBasic
   Attributes.cpp
   Builtins.cpp
Index: clang/include/clang/Config/config.h.cmake
===================================================================
--- clang/include/clang/Config/config.h.cmake
+++ clang/include/clang/Config/config.h.cmake
@@ -57,6 +57,9 @@
 /* Define if we have sys/resource.h (rlimits) */
 #cmakedefine CLANG_HAVE_RLIMITS ${CLANG_HAVE_RLIMITS}
 
+/* Define if iconv library is available */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 
Index: clang/include/clang/Basic/CharSet.h
===================================================================
--- clang/include/clang/Basic/CharSet.h
+++ clang/include/clang/Basic/CharSet.h
@@ -51,7 +51,10 @@
   /// In case of an error, the result string contains the successfully converted
   /// part of the input string.
   ///
-
+  /// If the Source parameter has a zero length, then no conversion is
+  /// performed. Instead, the internal conversion state of iconv is reset to
+  /// the initial state if iconv is used for the conversion. Otherwise it is a
+  /// no-op.
   virtual std::error_code convert(StringRef Source,
                                   SmallVectorImpl<char> &Result,
                                   bool ShouldAutoFlush) const = 0;
@@ -81,6 +84,8 @@
 
 /// Utility class to convert between different character set encodings.
 /// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
+/// If the iconv library is available, then arbitrary conversions are supported.
+/// TODO Add Windows support.
 class CharSetConverter {
   // details::CharSetConverterImplBase *Converter;
   std::unique_ptr<details::CharSetConverterImplBase> Converter;

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D153418: Adding iconv support to CharSetConverter class

Reply via email to