Skip to content

Commit 2f1a16a

Browse files
authored
feat(string): add String::Split, EscapedStringPy, and rename EscapeString (#550)
## Summary

- Rename `EscapeString` to `EscapeStringJSON` to clarify its JSON-specific escaping semantics (RFC 8259). A deprecated `EscapeString` alias is retained for backward compatibility.
- Add `EscapedStringPy` for Python-style string escaping that handles ANSI escape sequences, UTF-8 multibyte characters, and standard C escape sequences (`\n`, `\t`, `\r`, `\\`, `\"`).
- Add `String::Split(char delim)` utility method that returns `std::vector<std::string_view>` segments.
- Update the internal JSON call sites (`function.h`, `registry.h`, `json_writer.cc`) to use the new `EscapeStringJSON` name.
- `ReprPrinter` (`dataclass.cc`) now uses `EscapedStringPy` instead of the JSON escaper (`EscapeString`) for proper Python-style `__repr__` output.

## Motivation

The existing `EscapeString` function was JSON-specific but its name did not convey this. This rename makes intent explicit. The new `EscapedStringPy` function supports Python-style repr output needed for error messages and debugging. `String::Split` is a common utility needed across the codebase.
## Changes

| File | Change |
|------|--------|
| `include/tvm/ffi/string.h` | Rename `EscapeString` -> `EscapeStringJSON`, add deprecated alias, add `EscapedStringPy`, add `String::Split` |
| `include/tvm/ffi/string.h` | Cast to `unsigned char` before `std::isdigit` to avoid UB; use `\x1b` for ANSI escapes; validate UTF-8 continuation bytes |
| `include/tvm/ffi/function.h` | Update call site to `EscapeStringJSON` |
| `include/tvm/ffi/reflection/registry.h` | Update call site to `EscapeStringJSON` |
| `src/ffi/extra/dataclass.cc` | `ReprPrinter` uses `EscapedStringPy` for Python-style repr output |
| `src/ffi/extra/json_writer.cc` | Update call site to `EscapeStringJSON` |
| `tests/cpp/test_string.cc` | Add 7 test cases for `Split`, `EscapeStringJSON`, `EscapedStringPy` (basic, control chars, ANSI, UTF-8, malformed UTF-8) |

## Test plan

- [x] All 47 C++ string tests pass
- [x] `String::Split` tested with edge cases (empty, boundaries, consecutive delimiters)
- [x] `EscapeStringJSON` tested with special chars, backslash, quotes, control chars
- [x] `EscapedStringPy` tested: basic ASCII, control chars, ANSI sequences, valid UTF-8 (2/3/4-byte), malformed UTF-8
- [x] Existing Python tests pass (deprecated alias preserves compatibility)
1 parent 528ce7c commit 2f1a16a

6 files changed

Lines changed: 273 additions & 7 deletions

File tree

include/tvm/ffi/function.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -999,7 +999,7 @@ inline int32_t TypeKeyToIndex(std::string_view type_key) {
999999
using FuncInfo = ::tvm::ffi::details::FunctionInfo<decltype(Function)>; \
10001000
std::ostringstream os; \
10011001
os << R"({"type_schema":)" \
1002-
<< ::tvm::ffi::EscapeString(::tvm::ffi::String(FuncInfo::TypeSchema())) << R"(})"; \
1002+
<< ::tvm::ffi::EscapeStringJSON(::tvm::ffi::String(FuncInfo::TypeSchema())) << R"(})"; \
10031003
std::string data = os.str(); \
10041004
TVMFFIByteArray data_array{data.data(), data.size()}; \
10051005
return TVMFFIStringFromByteArray(&data_array, result); \

include/tvm/ffi/reflection/registry.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ class Metadata : public InfoTrait {
124124
} else if (std::optional<bool> v = value.as<bool>()) {
125125
os << (*v ? "true" : "false");
126126
} else if (std::optional<String> v = value.as<String>()) {
127-
String escaped = EscapeString(*v);
127+
String escaped = EscapeStringJSON(*v);
128128
os << escaped.c_str();
129129
} else {
130130
TVM_FFI_LOG_AND_THROW(TypeError) << "Metadata can be only int, bool or string, but on key `"

include/tvm/ffi/string.h

Lines changed: 146 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,15 @@
3030
#include <tvm/ffi/object.h>
3131
#include <tvm/ffi/type_traits.h>
3232

33+
#include <cctype>
3334
#include <cstddef>
3435
#include <cstring>
36+
#include <iomanip>
3537
#include <sstream>
3638
#include <string>
3739
#include <string_view>
3840
#include <utility>
41+
#include <vector>
3942

4043
// Note: We place string in tvm/ffi instead of tvm/ffi/container
4144
// because string itself needs special handling and is an inherent
@@ -736,6 +739,26 @@ class String {
736739
return std::string{data(), size()};
737740
}
738741

742+
/*!
743+
* \brief Split the string by a delimiter character.
744+
* \param delim The delimiter character.
745+
* \return A vector of string_views pointing into this string's data.
746+
* \note The returned string_views are only valid while this String is alive.
747+
*/
748+
std::vector<std::string_view> Split(char delim) const {
749+
std::vector<std::string_view> ret;
750+
const char* start = data();
751+
const char* end = start + size();
752+
for (const char* p = start; p < end; ++p) {
753+
if (*p == delim) {
754+
ret.emplace_back(start, static_cast<size_t>(p - start));
755+
start = p + 1;
756+
}
757+
}
758+
ret.emplace_back(start, static_cast<size_t>(end - start));
759+
return ret;
760+
}
761+
739762
private:
740763
template <typename, typename>
741764
friend struct TypeTraits;
@@ -802,11 +825,15 @@ class String {
802825
};
803826

804827
/*!
805-
* \brief Return an escaped version of the string
828+
* \brief Return a JSON-escaped version of the string (RFC 8259).
829+
*
830+
* Uses ``\\uXXXX`` for control characters, escapes ``\\/``, ``\\b``, ``\\f`` per the JSON spec.
831+
* Non-ASCII bytes are passed through as-is (valid UTF-8 is preserved).
832+
*
806833
* \param value The input string
807834
* \return The escaped string, quoted with double quotes
808835
*/
809-
inline String EscapeString(const String& value) {
836+
inline String EscapeStringJSON(const String& value) {
810837
std::ostringstream oss;
811838
oss << '"';
812839
const char* data = value.data();
@@ -847,6 +874,123 @@ inline String EscapeString(const String& value) {
847874
return String(oss.str());
848875
}
849876

877+
/*!
878+
* \brief Escape a string for JSON output.
879+
* \deprecated Use EscapeStringJSON instead.
880+
* \param value The input string
881+
* \return The escaped string, quoted with double quotes
882+
*/
883+
[[deprecated("Use EscapeStringJSON instead")]] inline String EscapeString(const String& value) {
884+
return EscapeStringJSON(value);
885+
}
886+
887+
/*!
888+
* \brief Return a Python-style escaped string representation.
889+
*
890+
* Handles ANSI escape sequences, UTF-8 multibyte characters, and standard
891+
* C escape sequences (\\n, \\t, \\r, \\\\, \\"). Uses \\xNN for control
892+
* characters and \\uXXXX / \\UXXXXXXXX for non-ASCII codepoints.
893+
*
894+
* \param value The input string to escape.
895+
* \return The escaped string, quoted with double quotes.
896+
*/
897+
inline String EscapedStringPy(const String& value) {
898+
const char* data = value.data();
899+
const size_t length = value.size();
900+
std::ostringstream oss;
901+
oss << '"';
902+
for (size_t i = 0; i < length;) {
903+
unsigned char c = static_cast<unsigned char>(data[i]);
904+
unsigned char d = (i + 1 < length) ? static_cast<unsigned char>(data[i + 1]) : 0;
905+
// Detect ANSI escape sequences
906+
if (c == '\x1b' && d == '[') {
907+
size_t j = i + 2;
908+
while (j < length && (std::isdigit(static_cast<unsigned char>(data[j])) || data[j] == ';')) {
909+
++j;
910+
}
911+
if (j < length && (data[j] == 'm' || data[j] == 'K')) {
912+
oss << "\\x1b[";
913+
for (i += 2; i <= j; ++i) {
914+
oss << data[i];
915+
}
916+
continue;
917+
}
918+
}
919+
// Handle ASCII C escape sequences
920+
switch (c) {
921+
case '\n':
922+
oss << "\\n";
923+
++i;
924+
continue;
925+
case '\t':
926+
oss << "\\t";
927+
++i;
928+
continue;
929+
case '\r':
930+
oss << "\\r";
931+
++i;
932+
continue;
933+
case '\\':
934+
oss << "\\\\";
935+
++i;
936+
continue;
937+
case '\"':
938+
oss << "\\\"";
939+
++i;
940+
continue;
941+
default:
942+
break;
943+
}
944+
// Handle ASCII
945+
if ((c & 0x80) == 0) {
946+
if (c < 0x20 || c == 0x7f) {
947+
// Escape control characters as \xNN
948+
char buf[5];
949+
TVM_FFI_SNPRINTF(buf, sizeof(buf), "\\x%02x", static_cast<unsigned>(c));
950+
oss << buf;
951+
} else {
952+
oss << static_cast<char>(c);
953+
}
954+
++i;
955+
continue;
956+
}
957+
if ((c & 0xE0) == 0xC0 && i + 1 < length && (d & 0xC0) == 0x80) {
958+
int32_t codepoint = ((c & 0x1F) << 6) | (d & 0x3F);
959+
oss << "\\u" << std::hex << std::setw(4) << std::setfill('0') << codepoint;
960+
i += 2;
961+
} else if ((c & 0xF0) == 0xE0 && i + 2 < length) {
962+
unsigned char e = static_cast<unsigned char>(data[i + 2]);
963+
if ((d & 0xC0) == 0x80 && (e & 0xC0) == 0x80) {
964+
int32_t codepoint = ((c & 0x0F) << 12) | ((d & 0x3F) << 6) | (e & 0x3F);
965+
oss << "\\u" << std::hex << std::setw(4) << std::setfill('0') << codepoint;
966+
i += 3;
967+
} else {
968+
oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(c);
969+
++i;
970+
}
971+
} else if ((c & 0xF8) == 0xF0 && i + 3 < length) {
972+
unsigned char e = static_cast<unsigned char>(data[i + 2]);
973+
unsigned char f = static_cast<unsigned char>(data[i + 3]);
974+
if ((d & 0xC0) == 0x80 && (e & 0xC0) == 0x80 && (f & 0xC0) == 0x80) {
975+
int32_t codepoint =
976+
((c & 0x07) << 18) | ((d & 0x3F) << 12) | ((e & 0x3F) << 6) | (f & 0x3F);
977+
oss << "\\U" << std::hex << std::setw(8) << std::setfill('0') << codepoint;
978+
i += 4;
979+
} else {
980+
oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(c);
981+
++i;
982+
}
983+
} else {
984+
oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(c);
985+
++i;
986+
}
987+
oss.unsetf(std::ios::adjustfield | std::ios::basefield | std::ios::floatfield);
988+
oss.fill(' ');
989+
}
990+
oss << '"';
991+
return String(oss.str());
992+
}
993+
850994
/*! \brief Convert TVMFFIByteArray to std::string_view */
851995
TVM_FFI_INLINE std::string_view ToStringView(TVMFFIByteArray str) {
852996
return std::string_view(str.data, str.size);

src/ffi/extra/dataclass.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -779,7 +779,7 @@ class ReprPrinter : public ObjectGraphDFS<ReprPrinter, ReprFrame, std::string> {
779779
}
780780
if (ti == TypeIndex::kTVMFFISmallStr) {
781781
String s = value.cast<String>();
782-
String escaped = EscapeString(s);
782+
String escaped = EscapedStringPy(s);
783783
*out = std::string(escaped.data(), escaped.size());
784784
return true;
785785
}
@@ -812,7 +812,7 @@ class ReprPrinter : public ObjectGraphDFS<ReprPrinter, ReprFrame, std::string> {
812812
// String/Bytes on heap
813813
if (ti == TypeIndex::kTVMFFIStr) {
814814
String s = details::AnyUnsafe::CopyFromAnyViewAfterCheck<String>(value);
815-
String escaped = EscapeString(s);
815+
String escaped = EscapedStringPy(s);
816816
*out = std::string(escaped.data(), escaped.size());
817817
return true;
818818
}

src/ffi/extra/json_writer.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ class JSONWriter {
186186
}
187187

188188
void WriteString(const String& value) {
189-
String escaped = EscapeString(value);
189+
String escaped = EscapeStringJSON(value);
190190
std::copy(escaped.data(), escaped.data() + escaped.size(), out_iter_);
191191
}
192192

tests/cpp/test_string.cc

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,4 +528,126 @@ TEST(String, EndsWith) {
528528
EXPECT_FALSE(single.ends_with("yx"));
529529
}
530530

531+
// Split must return one view per delimited field, including empty fields.
TEST(String, Split) {
  {
    // Three single-character fields.
    String text{"a,b,c"};
    auto fields = text.Split(',');
    ASSERT_EQ(fields.size(), 3);
    EXPECT_EQ(fields[0], "a");
    EXPECT_EQ(fields[1], "b");
    EXPECT_EQ(fields[2], "c");
  }
  {
    // Input without the delimiter comes back as a single field.
    String text{"hello"};
    auto fields = text.Split(',');
    ASSERT_EQ(fields.size(), 1);
    EXPECT_EQ(fields[0], "hello");
  }
  {
    // An empty input still produces one (empty) field.
    String text{""};
    auto fields = text.Split(',');
    ASSERT_EQ(fields.size(), 1);
    EXPECT_EQ(fields[0], "");
  }
  {
    // Leading and trailing delimiters produce empty first/last fields.
    String text{",a,b,"};
    auto fields = text.Split(',');
    ASSERT_EQ(fields.size(), 4);
    EXPECT_EQ(fields[0], "");
    EXPECT_EQ(fields[1], "a");
    EXPECT_EQ(fields[2], "b");
    EXPECT_EQ(fields[3], "");
  }
  {
    // Back-to-back delimiters produce an empty field between them.
    String text{"a,,b"};
    auto fields = text.Split(',');
    ASSERT_EQ(fields.size(), 3);
    EXPECT_EQ(fields[0], "a");
    EXPECT_EQ(fields[1], "");
    EXPECT_EQ(fields[2], "b");
  }
}
568+
569+
TEST(String, EscapeStringJSON) {
570+
// Basic escaping
571+
String s1{"hello"};
572+
EXPECT_EQ(EscapeStringJSON(s1), "\"hello\"");
573+
574+
// Special characters
575+
String s2{"line1\nline2\ttab"};
576+
EXPECT_EQ(EscapeStringJSON(s2), "\"line1\\nline2\\ttab\"");
577+
578+
// Backslash and quote
579+
String s3{"a\\b\"c"};
580+
EXPECT_EQ(EscapeStringJSON(s3), "\"a\\\\b\\\"c\"");
581+
582+
// Control characters
583+
String s4{std::string("a\x01\x1f z", 5)};
584+
EXPECT_EQ(EscapeStringJSON(s4), "\"a\\u0001\\u001f z\"");
585+
}
586+
587+
TEST(String, EscapedStringPyBasic) {
588+
// Plain ASCII
589+
String s1{"hello world"};
590+
EXPECT_EQ(EscapedStringPy(s1), "\"hello world\"");
591+
592+
// C escape sequences
593+
String s2{"a\nb\tc\r"};
594+
EXPECT_EQ(EscapedStringPy(s2), "\"a\\nb\\tc\\r\"");
595+
596+
// Backslash and quote
597+
String s3{"a\\b\"c"};
598+
EXPECT_EQ(EscapedStringPy(s3), "\"a\\\\b\\\"c\"");
599+
}
600+
601+
TEST(String, EscapedStringPyControlChars) {
602+
// Control characters -> \xNN
603+
String s1{std::string("\x01\x02\x7f", 3)};
604+
String result = EscapedStringPy(s1);
605+
EXPECT_EQ(result, "\"\\x01\\x02\\x7f\"");
606+
}
607+
608+
TEST(String, EscapedStringPyANSI) {
609+
// ANSI escape: ESC[31m (red)
610+
String s1{std::string("\x1b[31mred\x1b[0m", 12)};
611+
String result = EscapedStringPy(s1);
612+
EXPECT_EQ(result, "\"\\x1b[31mred\\x1b[0m\"");
613+
614+
// ANSI erase line: ESC[K
615+
String s2{std::string("\x1b[K", 3)};
616+
EXPECT_EQ(EscapedStringPy(s2), "\"\\x1b[K\"");
617+
}
618+
619+
TEST(String, EscapedStringPyUTF8) {
620+
// 2-byte: U+00E9 (é) = C3 A9
621+
String s1{std::string("\xc3\xa9", 2)};
622+
EXPECT_EQ(EscapedStringPy(s1), "\"\\u00e9\"");
623+
624+
// 3-byte: U+4E16 (世) = E4 B8 96
625+
String s2{std::string("\xe4\xb8\x96", 3)};
626+
EXPECT_EQ(EscapedStringPy(s2), "\"\\u4e16\"");
627+
628+
// 4-byte: U+1F600 (😀) = F0 9F 98 80
629+
String s3{std::string("\xf0\x9f\x98\x80", 4)};
630+
EXPECT_EQ(EscapedStringPy(s3), "\"\\U0001f600\"");
631+
}
632+
633+
TEST(String, EscapedStringPyMalformedUTF8) {
634+
// Lone continuation byte -> \xNN fallback
635+
String s1{std::string("\x80", 1)};
636+
EXPECT_EQ(EscapedStringPy(s1), "\"\\x80\"");
637+
638+
// 2-byte leader followed by non-continuation -> fallback for leader
639+
String s2{std::string("\xc3\x20", 2)};
640+
String result2 = EscapedStringPy(s2);
641+
EXPECT_EQ(result2, "\"\\xc3 \"");
642+
643+
// 3-byte leader with bad continuation -> fallback for leader
644+
String s3{std::string("\xe4\xb8\x20", 3)};
645+
String result3 = EscapedStringPy(s3);
646+
EXPECT_EQ(result3, "\"\\xe4\\xb8 \"");
647+
648+
// Truncated 2-byte at end of string
649+
String s4{std::string("\xc3", 1)};
650+
EXPECT_EQ(EscapedStringPy(s4), "\"\\xc3\"");
651+
}
652+
531653
} // namespace

0 commit comments

Comments
 (0)