This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm-ffi.git


The following commit(s) were added to refs/heads/main by this push:
     new 2f1a16a  feat(string): add String::Split, EscapedStringPy, and rename 
EscapeString (#550)
2f1a16a is described below

commit 2f1a16a4d41fb05e0fc313ed20a3df35a9a9bca8
Author: Junru Shao <[email protected]>
AuthorDate: Tue Apr 14 14:59:28 2026 -0700

    feat(string): add String::Split, EscapedStringPy, and rename EscapeString 
(#550)
    
    ## Summary
    
    - Rename `EscapeString` to `EscapeStringJSON` to clarify its
    JSON-specific escaping semantics (RFC 8259). A deprecated `EscapeString`
    alias is retained for backward compatibility.
    - Add `EscapedStringPy` for Python-style string escaping that handles
    ANSI escape sequences, UTF-8 multibyte characters, and standard C escape
    sequences (`\n`, `\t`, `\r`, `\\`, `\"`).
    - Add `String::Split(char delim)` utility method that returns
    `std::vector<std::string_view>` segments.
    - Update all internal call sites (`function.h`, `registry.h`,
    `dataclass.cc`, `json_writer.cc`) to use the new `EscapeStringJSON`
    name.
    - `ReprPrinter` now uses `EscapedStringPy` instead of `EscapeStringJSON`
    for proper Python-style `__repr__` output.
    
    ## Motivation
    
    The existing `EscapeString` function was JSON-specific but its name did
    not convey this. This rename makes intent explicit. The new
    `EscapedStringPy` function supports Python-style repr output needed for
    error messages and debugging. `String::Split` is a common utility needed
    across the codebase.
    
    ## Changes
    
    | File | Change |
    |------|--------|
    | `include/tvm/ffi/string.h` | Rename `EscapeString` -> `EscapeStringJSON`, add deprecated alias, add `EscapedStringPy`, add `String::Split` |
    | `include/tvm/ffi/string.h` | Cast to `unsigned char` before `std::isdigit` to avoid UB; use `\x1b` for ANSI escapes; validate UTF-8 continuation bytes |
    | `include/tvm/ffi/function.h` | Update call site to `EscapeStringJSON` |
    | `include/tvm/ffi/reflection/registry.h` | Update call site to `EscapeStringJSON` |
    | `src/ffi/extra/dataclass.cc` | `ReprPrinter` uses `EscapedStringPy` for Python-style repr output |
    | `src/ffi/extra/json_writer.cc` | Update call site to `EscapeStringJSON` |
    | `tests/cpp/test_string.cc` | Add 7 test cases for `Split`, `EscapeStringJSON`, `EscapedStringPy` (basic, control chars, ANSI, UTF-8, malformed UTF-8) |
    
    ## Test plan
    
    - [x] All 47 C++ string tests pass
    - [x] `String::Split` tested with edge cases (empty, boundaries,
    consecutive delimiters)
    - [x] `EscapeStringJSON` tested with special chars, backslash, quotes,
    control chars
    - [x] `EscapedStringPy` tested: basic ASCII, control chars, ANSI
    sequences, valid UTF-8 (2/3/4-byte), malformed UTF-8
    - [x] Existing Python tests pass (deprecated alias preserves
    compatibility)
---
 include/tvm/ffi/function.h            |   2 +-
 include/tvm/ffi/reflection/registry.h |   2 +-
 include/tvm/ffi/string.h              | 148 +++++++++++++++++++++++++++++++++-
 src/ffi/extra/dataclass.cc            |   4 +-
 src/ffi/extra/json_writer.cc          |   2 +-
 tests/cpp/test_string.cc              | 122 ++++++++++++++++++++++++++++
 6 files changed, 273 insertions(+), 7 deletions(-)

diff --git a/include/tvm/ffi/function.h b/include/tvm/ffi/function.h
index 2ee1a0d..4ec0e00 100644
--- a/include/tvm/ffi/function.h
+++ b/include/tvm/ffi/function.h
@@ -999,7 +999,7 @@ inline int32_t TypeKeyToIndex(std::string_view type_key) {
     using FuncInfo = ::tvm::ffi::details::FunctionInfo<decltype(Function)>;    
                  \
     std::ostringstream os;                                                     
                  \
     os << R"({"type_schema":)"                                                 
                  \
-       << ::tvm::ffi::EscapeString(::tvm::ffi::String(FuncInfo::TypeSchema())) 
<< R"(})";        \
+       << 
::tvm::ffi::EscapeStringJSON(::tvm::ffi::String(FuncInfo::TypeSchema())) << 
R"(})";    \
     std::string data = os.str();                                               
                  \
     TVMFFIByteArray data_array{data.data(), data.size()};                      
                  \
     return TVMFFIStringFromByteArray(&data_array, result);                     
                  \
diff --git a/include/tvm/ffi/reflection/registry.h 
b/include/tvm/ffi/reflection/registry.h
index 8f7c68c..08570f8 100644
--- a/include/tvm/ffi/reflection/registry.h
+++ b/include/tvm/ffi/reflection/registry.h
@@ -124,7 +124,7 @@ class Metadata : public InfoTrait {
       } else if (std::optional<bool> v = value.as<bool>()) {
         os << (*v ? "true" : "false");
       } else if (std::optional<String> v = value.as<String>()) {
-        String escaped = EscapeString(*v);
+        String escaped = EscapeStringJSON(*v);
         os << escaped.c_str();
       } else {
         TVM_FFI_LOG_AND_THROW(TypeError) << "Metadata can be only int, bool or 
string, but on key `"
diff --git a/include/tvm/ffi/string.h b/include/tvm/ffi/string.h
index 3a3ffcd..4769957 100644
--- a/include/tvm/ffi/string.h
+++ b/include/tvm/ffi/string.h
@@ -30,12 +30,15 @@
 #include <tvm/ffi/object.h>
 #include <tvm/ffi/type_traits.h>
 
+#include <cctype>
 #include <cstddef>
 #include <cstring>
+#include <iomanip>
 #include <sstream>
 #include <string>
 #include <string_view>
 #include <utility>
+#include <vector>
 
 // Note: We place string in tvm/ffi instead of tvm/ffi/container
 // because string itself needs special handling and is an inherent
@@ -736,6 +739,26 @@ class String {
     return std::string{data(), size()};
   }
 
+  /*!
+   * \brief Break the string into the segments separated by a delimiter.
+   *
+   * The delimiter itself is not part of any segment. The result always holds
+   * at least one entry: a string without the delimiter yields itself, and
+   * leading, trailing or consecutive delimiters yield empty segments.
+   *
+   * \param delim The delimiter character.
+   * \return The segments, in order, as views into this string's buffer.
+   * \note The views do not own their data; they must not outlive this String.
+   */
+  std::vector<std::string_view> Split(char delim) const {
+    const std::string_view whole{data(), size()};
+    std::vector<std::string_view> segments;
+    size_t seg_begin = 0;
+    size_t hit = whole.find(delim);
+    while (hit != std::string_view::npos) {
+      segments.push_back(whole.substr(seg_begin, hit - seg_begin));
+      seg_begin = hit + 1;
+      hit = whole.find(delim, seg_begin);
+    }
+    // Whatever follows the last delimiter (possibly nothing) is the tail.
+    segments.push_back(whole.substr(seg_begin));
+    return segments;
+  }
+
  private:
   template <typename, typename>
   friend struct TypeTraits;
@@ -802,11 +825,15 @@ class String {
 };
 
 /*!
- * \brief Return an escaped version of the string
+ * \brief Return a JSON-escaped version of the string (RFC 8259).
+ *
+ * Uses ``\\uXXXX`` for control characters, escapes ``\\/``, ``\\b``, ``\\f`` 
per the JSON spec.
+ * Non-ASCII bytes are passed through as-is (valid UTF-8 is preserved).
+ *
  * \param value The input string
  * \return The escaped string, quoted with double quotes
  */
-inline String EscapeString(const String& value) {
+inline String EscapeStringJSON(const String& value) {
   std::ostringstream oss;
   oss << '"';
   const char* data = value.data();
@@ -847,6 +874,123 @@ inline String EscapeString(const String& value) {
   return String(oss.str());
 }
 
+/*!
+ * \brief Backward-compatible name for EscapeStringJSON.
+ *
+ * Forwards to EscapeStringJSON without any additional processing.
+ *
+ * \deprecated Use EscapeStringJSON instead.
+ * \param value The input string
+ * \return The result of EscapeStringJSON(value)
+ */
+[[deprecated("Use EscapeStringJSON instead")]]
+inline String EscapeString(const String& value) {
+  return EscapeStringJSON(value);
+}
+
+/*!
+ * \brief Return a Python-style escaped, double-quoted form of a string.
+ *
+ * The input is scanned byte by byte and rendered as follows:
+ * - ANSI sequences of the form ``ESC [ <digits and ';'> (m|K)`` are emitted
+ *   as ``\\x1b[`` followed by the parameter bytes and final byte verbatim.
+ * - ``\\n``, ``\\t``, ``\\r``, ``\\\\`` and ``\\"`` use their C escape form.
+ * - Other ASCII control characters (below 0x20, and 0x7f) become ``\\xNN``.
+ * - Remaining ASCII bytes are copied unchanged.
+ * - Well-formed UTF-8 sequences become ``\\uXXXX`` (2- and 3-byte sequences)
+ *   or ``\\UXXXXXXXX`` (4-byte sequences).
+ * - A byte that does not start a well-formed UTF-8 sequence (stray
+ *   continuation byte, truncated sequence, bad continuation byte, overlong
+ *   encoding, UTF-16 surrogate, or code point above U+10FFFF) is emitted on
+ *   its own as ``\\xNN`` and scanning resumes at the following byte.
+ *
+ * \param value The input string to escape.
+ * \return The escaped string, quoted with double quotes.
+ */
+inline String EscapedStringPy(const String& value) {
+  const char* data = value.data();
+  const size_t length = value.size();
+  std::ostringstream oss;
+  // Emit one raw byte as \xNN (control characters and malformed UTF-8).
+  auto emit_hex_byte = [&oss](unsigned char byte) {
+    char buf[5];  // backslash, 'x', two hex digits, NUL
+    TVM_FFI_SNPRINTF(buf, sizeof(buf), "\\x%02x", static_cast<unsigned>(byte));
+    oss << buf;
+  };
+  oss << '"';
+  for (size_t i = 0; i < length;) {
+    const unsigned char c = static_cast<unsigned char>(data[i]);
+    const unsigned char d =
+        (i + 1 < length) ? static_cast<unsigned char>(data[i + 1]) : 0;
+    // Detect ANSI escape sequences
+    if (c == '\x1b' && d == '[') {
+      size_t j = i + 2;
+      while (j < length &&
+             (std::isdigit(static_cast<unsigned char>(data[j])) || data[j] == ';')) {
+        ++j;
+      }
+      if (j < length && (data[j] == 'm' || data[j] == 'K')) {
+        // Only the ESC byte is escaped; parameters and final byte are kept.
+        oss << "\\x1b[";
+        for (i += 2; i <= j; ++i) {
+          oss << data[i];
+        }
+        continue;
+      }
+      // Not a recognized sequence: fall through, ESC is escaped as \x1b below.
+    }
+    // Handle ASCII C escape sequences
+    switch (c) {
+      case '\n':
+        oss << "\\n";
+        ++i;
+        continue;
+      case '\t':
+        oss << "\\t";
+        ++i;
+        continue;
+      case '\r':
+        oss << "\\r";
+        ++i;
+        continue;
+      case '\\':
+        oss << "\\\\";
+        ++i;
+        continue;
+      case '\"':
+        oss << "\\\"";
+        ++i;
+        continue;
+      default:
+        break;
+    }
+    // Handle ASCII
+    if ((c & 0x80) == 0) {
+      if (c < 0x20 || c == 0x7f) {
+        emit_hex_byte(c);
+      } else {
+        oss << static_cast<char>(c);
+      }
+      ++i;
+      continue;
+    }
+    // Classify the leader byte: total sequence length, its payload bits, and
+    // the smallest code point that legitimately needs that many bytes.
+    size_t seq_len = 0;
+    uint32_t codepoint = 0;
+    uint32_t min_codepoint = 0;
+    if ((c & 0xE0) == 0xC0) {
+      seq_len = 2;
+      codepoint = c & 0x1Fu;
+      min_codepoint = 0x80;
+    } else if ((c & 0xF0) == 0xE0) {
+      seq_len = 3;
+      codepoint = c & 0x0Fu;
+      min_codepoint = 0x800;
+    } else if ((c & 0xF8) == 0xF0) {
+      seq_len = 4;
+      codepoint = c & 0x07u;
+      min_codepoint = 0x10000;
+    }
+    // seq_len == 0 means a stray continuation byte or an invalid leader.
+    bool well_formed = seq_len != 0 && length - i >= seq_len;
+    for (size_t k = 1; well_formed && k < seq_len; ++k) {
+      const unsigned char cont = static_cast<unsigned char>(data[i + k]);
+      if ((cont & 0xC0) != 0x80) {
+        well_formed = false;
+      } else {
+        codepoint = (codepoint << 6) | (cont & 0x3Fu);
+      }
+    }
+    if (well_formed) {
+      // Continuation bits alone do not make a sequence valid: reject overlong
+      // encodings, UTF-16 surrogates, and values past the Unicode range so
+      // that no bogus \u / \U escape is produced.
+      const bool overlong = codepoint < min_codepoint;
+      const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
+      const bool out_of_range = codepoint > 0x10FFFF;
+      well_formed = !(overlong || surrogate || out_of_range);
+    }
+    if (!well_formed) {
+      // Escape only the offending byte; the next bytes are re-examined.
+      emit_hex_byte(c);
+      ++i;
+      continue;
+    }
+    char buf[11];  // longest form: backslash, 'U', eight hex digits, NUL
+    if (seq_len == 4) {
+      TVM_FFI_SNPRINTF(buf, sizeof(buf), "\\U%08x", static_cast<unsigned>(codepoint));
+    } else {
+      TVM_FFI_SNPRINTF(buf, sizeof(buf), "\\u%04x", static_cast<unsigned>(codepoint));
+    }
+    oss << buf;
+    i += seq_len;
+  }
+  oss << '"';
+  return String(oss.str());
+}
+
 /*! \brief Convert TVMFFIByteArray to std::string_view */
 TVM_FFI_INLINE std::string_view ToStringView(TVMFFIByteArray str) {
   return std::string_view(str.data, str.size);
diff --git a/src/ffi/extra/dataclass.cc b/src/ffi/extra/dataclass.cc
index 4a2cc2c..23e7728 100644
--- a/src/ffi/extra/dataclass.cc
+++ b/src/ffi/extra/dataclass.cc
@@ -779,7 +779,7 @@ class ReprPrinter : public ObjectGraphDFS<ReprPrinter, 
ReprFrame, std::string> {
     }
     if (ti == TypeIndex::kTVMFFISmallStr) {
       String s = value.cast<String>();
-      String escaped = EscapeString(s);
+      String escaped = EscapedStringPy(s);
       *out = std::string(escaped.data(), escaped.size());
       return true;
     }
@@ -812,7 +812,7 @@ class ReprPrinter : public ObjectGraphDFS<ReprPrinter, 
ReprFrame, std::string> {
     // String/Bytes on heap
     if (ti == TypeIndex::kTVMFFIStr) {
       String s = details::AnyUnsafe::CopyFromAnyViewAfterCheck<String>(value);
-      String escaped = EscapeString(s);
+      String escaped = EscapedStringPy(s);
       *out = std::string(escaped.data(), escaped.size());
       return true;
     }
diff --git a/src/ffi/extra/json_writer.cc b/src/ffi/extra/json_writer.cc
index 240360d..77f4411 100644
--- a/src/ffi/extra/json_writer.cc
+++ b/src/ffi/extra/json_writer.cc
@@ -186,7 +186,7 @@ class JSONWriter {
   }
 
   void WriteString(const String& value) {
-    String escaped = EscapeString(value);
+    String escaped = EscapeStringJSON(value);
     std::copy(escaped.data(), escaped.data() + escaped.size(), out_iter_);
   }
 
diff --git a/tests/cpp/test_string.cc b/tests/cpp/test_string.cc
index 020cc41..f2b5005 100644
--- a/tests/cpp/test_string.cc
+++ b/tests/cpp/test_string.cc
@@ -528,4 +528,126 @@ TEST(String, EndsWith) {
   EXPECT_FALSE(single.ends_with("yx"));
 }
 
+TEST(String, Split) {
+  // Three fields separated by single delimiters
+  {
+    String text{"a,b,c"};
+    auto fields = text.Split(',');
+    ASSERT_EQ(fields.size(), 3);
+    EXPECT_EQ(fields[0], "a");
+    EXPECT_EQ(fields[1], "b");
+    EXPECT_EQ(fields[2], "c");
+  }
+  // Input without the delimiter comes back as one field
+  {
+    String text{"hello"};
+    auto fields = text.Split(',');
+    ASSERT_EQ(fields.size(), 1);
+    EXPECT_EQ(fields[0], "hello");
+  }
+  // Empty input yields a single empty field
+  {
+    String text{""};
+    auto fields = text.Split(',');
+    ASSERT_EQ(fields.size(), 1);
+    EXPECT_EQ(fields[0], "");
+  }
+  // Leading and trailing delimiters produce empty edge fields
+  {
+    String text{",a,b,"};
+    auto fields = text.Split(',');
+    ASSERT_EQ(fields.size(), 4);
+    EXPECT_EQ(fields[0], "");
+    EXPECT_EQ(fields[1], "a");
+    EXPECT_EQ(fields[2], "b");
+    EXPECT_EQ(fields[3], "");
+  }
+  // Back-to-back delimiters produce an empty field between them
+  {
+    String text{"a,,b"};
+    auto fields = text.Split(',');
+    ASSERT_EQ(fields.size(), 3);
+    EXPECT_EQ(fields[0], "a");
+    EXPECT_EQ(fields[1], "");
+    EXPECT_EQ(fields[2], "b");
+  }
+}
+
+TEST(String, EscapeStringJSON) {
+  // Plain text is only wrapped in double quotes
+  String plain{"hello"};
+  EXPECT_EQ(EscapeStringJSON(plain), R"("hello")");
+
+  // Newline and tab use their short escapes
+  String whitespace{"line1\nline2\ttab"};
+  EXPECT_EQ(EscapeStringJSON(whitespace), R"("line1\nline2\ttab")");
+
+  // Backslash and double quote are backslash-escaped
+  String quoting{"a\\b\"c"};
+  EXPECT_EQ(EscapeStringJSON(quoting), R"("a\\b\"c")");
+
+  // Other control characters become \uXXXX
+  String control{std::string("a\x01\x1f z", 5)};
+  EXPECT_EQ(EscapeStringJSON(control), "\"a\\u0001\\u001f z\"");
+}
+
+TEST(String, EscapedStringPyBasic) {
+  // Printable ASCII passes through, wrapped in double quotes
+  String plain{"hello world"};
+  EXPECT_EQ(EscapedStringPy(plain), R"("hello world")");
+
+  // Newline, tab and carriage return use their C escapes
+  String whitespace{"a\nb\tc\r"};
+  EXPECT_EQ(EscapedStringPy(whitespace), R"("a\nb\tc\r")");
+
+  // Backslash and double quote are backslash-escaped
+  String quoting{"a\\b\"c"};
+  EXPECT_EQ(EscapedStringPy(quoting), R"("a\\b\"c")");
+}
+
+TEST(String, EscapedStringPyControlChars) {
+  // SOH, STX and DEL have no short escape and are rendered as \xNN
+  String control{std::string("\x01\x02\x7f", 3)};
+  String escaped = EscapedStringPy(control);
+  EXPECT_EQ(escaped, R"("\x01\x02\x7f")");
+}
+
+TEST(String, EscapedStringPyANSI) {
+  // Color sequences: ESC[31m (red) ... ESC[0m (reset)
+  String colored{std::string("\x1b[31mred\x1b[0m", 12)};
+  String colored_escaped = EscapedStringPy(colored);
+  EXPECT_EQ(colored_escaped, R"("\x1b[31mred\x1b[0m")");
+
+  // Erase-line sequence without parameters: ESC[K
+  String erase_line{std::string("\x1b[K", 3)};
+  EXPECT_EQ(EscapedStringPy(erase_line), R"("\x1b[K")");
+}
+
+TEST(String, EscapedStringPyUTF8) {
+  // Two-byte sequence C3 A9 decodes to U+00E9
+  String two_byte{std::string("\xc3\xa9", 2)};
+  EXPECT_EQ(EscapedStringPy(two_byte), "\"\\u00e9\"");
+
+  // Three-byte sequence E4 B8 96 decodes to U+4E16
+  String three_byte{std::string("\xe4\xb8\x96", 3)};
+  EXPECT_EQ(EscapedStringPy(three_byte), "\"\\u4e16\"");
+
+  // Four-byte sequence F0 9F 98 80 decodes to U+1F600
+  String four_byte{std::string("\xf0\x9f\x98\x80", 4)};
+  EXPECT_EQ(EscapedStringPy(four_byte), "\"\\U0001f600\"");
+}
+
+TEST(String, EscapedStringPyMalformedUTF8) {
+  // A continuation byte with no leader is escaped as \xNN
+  String stray_continuation{std::string("\x80", 1)};
+  EXPECT_EQ(EscapedStringPy(stray_continuation), R"("\x80")");
+
+  // 2-byte leader followed by a space: only the leader is escaped
+  String bad_two_byte{std::string("\xc3\x20", 2)};
+  String bad_two_byte_escaped = EscapedStringPy(bad_two_byte);
+  EXPECT_EQ(bad_two_byte_escaped, R"("\xc3 ")");
+
+  // 3-byte leader whose last continuation is a space: both bytes are escaped
+  String bad_three_byte{std::string("\xe4\xb8\x20", 3)};
+  String bad_three_byte_escaped = EscapedStringPy(bad_three_byte);
+  EXPECT_EQ(bad_three_byte_escaped, R"("\xe4\xb8 ")");
+
+  // 2-byte leader as the final byte of the input
+  String truncated{std::string("\xc3", 1)};
+  EXPECT_EQ(EscapedStringPy(truncated), R"("\xc3")");
+}
+
 }  // namespace

Reply via email to