This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7dacbd0484 GH-48941: [C++] Generate proper UTF-8 strings in JSON test
utilities (#48943)
7dacbd0484 is described below
commit 7dacbd04847385cef6543bc00a3612d09f839cac
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Wed Feb 4 03:13:30 2026 +0900
GH-48941: [C++] Generate proper UTF-8 strings in JSON test utilities
(#48943)
### Rationale for this change
The JSON test utility `GenerateAscii` only generated ASCII characters.
The tests should also cover proper UTF-8 and Unicode handling.
### What changes are included in this PR?
Replaced ASCII-only generation with proper UTF-8 string generation that
produces valid Unicode scalar values across all planes (BMP, SMP, SIP, planes
3-16), correctly encoded per RFC 3629.
The generator is added as a reusable testing utility, `RandomUtf8String`.
### Are these changes tested?
Yes, they are exercised by the existing JSON tests.
### Are there any user-facing changes?
No, test-only.
* GitHub Issue: #48941
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/json/test_common.h | 16 ++++-----
cpp/src/arrow/testing/random.cc | 77 ++++++++++++++++++++++++++++++++++++++++
cpp/src/arrow/testing/random.h | 15 ++++++++
3 files changed, 100 insertions(+), 8 deletions(-)
diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h
index 423a0123c0..ab2ce9cdc7 100644
--- a/cpp/src/arrow/json/test_common.h
+++ b/cpp/src/arrow/json/test_common.h
@@ -33,6 +33,7 @@
#include "arrow/json/parser.h"
#include "arrow/json/rapidjson_defs.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/visit_type_inline.h"
@@ -110,20 +111,19 @@ struct GenerateImpl {
return OK(writer.Double(val));
}
- Status GenerateAscii(const DataType&) {
- auto size = std::poisson_distribution<>{4}(e);
- std::uniform_int_distribution<uint16_t> gen_char(32, 126); // FIXME generate UTF8
- std::string s(size, '\0');
- for (char& ch : s) ch = static_cast<char>(gen_char(e));
- return OK(writer.String(s.c_str()));
+ Status GenerateUtf8(const DataType&) {
+ auto num_codepoints = std::poisson_distribution<>{4}(e);
+ auto seed = std::uniform_int_distribution<uint32_t>{}(e);
+ std::string s = RandomUtf8String(seed, num_codepoints);
+ return OK(writer.String(s));
}
template <typename T>
enable_if_base_binary<T, Status> Visit(const T& t) {
- return GenerateAscii(t);
+ return GenerateUtf8(t);
}
- Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
+ Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); }
template <typename T>
enable_if_list_like<T, Status> Visit(const T& t) {
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index c50387e490..f73dbd5bbf 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -1475,4 +1475,81 @@ void rand_month_day_nanos(int64_t N,
});
}
+std::string RandomUtf8String(random::SeedType seed, int num_chars) {
+ arrow::random::pcg32 gen(seed);
+ std::string s;
+ s.reserve(num_chars * 3); // Reserve for average 3 bytes per codepoint
+
+ std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
+ std::bernoulli_distribution bmp_range_dist(0.5);
+ std::uniform_int_distribution<uint32_t> bmp_lower_dist(0x0020, 0xD7FF);
+ std::uniform_int_distribution<uint32_t> bmp_upper_dist(0xE000, 0xFFFD);
+ std::uniform_int_distribution<uint32_t> smp_dist(0x10000, 0x1FFFF);
+ std::uniform_int_distribution<uint32_t> sip_dist(0x20000, 0x2FFFF);
+ std::uniform_int_distribution<uint32_t> high_plane_dist(0x30000, 0x10FFFF);
+
+ for (int i = 0; i < num_chars; ++i) {
+ uint32_t codepoint;
+ uint32_t plane = plane_dist(gen);
+
+ if (plane == 0) {
+ // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
+ // Exclude surrogate code points (U+D800 to U+DFFF)
+ // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
+ // Exclude control chars below U+0020 for readability
+ // Generate from two ranges with equal probability (overrepresents the smaller
+ // upper range):
+ // - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
+ // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
+ if (bmp_range_dist(gen)) {
+ // Lower range: U+0020 to U+D7FF (before surrogate range)
+ codepoint = bmp_lower_dist(gen);
+ } else {
+ // Upper range: U+E000 to U+FFFD (after surrogate range)
+ // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
+ // Other noncharacters (U+FDD0-U+FDEF, and the last two code points of each
+ // supplementary plane) are still generated, being valid Unicode scalar values
+ codepoint = bmp_upper_dist(gen);
+ }
+ } else if (plane == 1) {
+ // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
+ // https://www.unicode.org/roadmaps/smp/
+ codepoint = smp_dist(gen);
+ } else if (plane == 2) {
+ // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
+ // https://www.unicode.org/roadmaps/sip/
+ codepoint = sip_dist(gen);
+ } else {
+ // Planes 3–16: U+30000–U+10FFFF
+ // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF
+ // Max valid Unicode codepoint is U+10FFFF per the Standard
+ // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
+ codepoint = high_plane_dist(gen);
+ }
+
+ // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
+ // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
+ if (codepoint <= 0x7F) {
+ // 1-byte sequence: 0xxxxxxx
+ s.push_back(static_cast<char>(codepoint));
+ } else if (codepoint <= 0x7FF) {
+ // 2-byte sequence: 110xxxxx 10xxxxxx
+ s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
+ s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+ } else if (codepoint <= 0xFFFF) {
+ // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+ s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
+ s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+ s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+ } else {
+ // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
+ s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+ s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+ s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+ }
+ }
+ return s;
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index d9122915a0..f820e64398 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -729,6 +729,21 @@ ARROW_TESTING_EXPORT
void rand_month_day_nanos(int64_t N,
std::vector<MonthDayNanoIntervalType::MonthDayNanos>* out);
+/// \brief Generate a random UTF-8 encoded string
+///
+/// Generates a string with valid UTF-8 encoding from random Unicode scalar values.
+/// Each of the num_chars code points is drawn from one of four equally likely groups:
+/// the Basic Multilingual Plane (BMP), Supplementary Multilingual Plane (SMP),
+/// Supplementary Ideographic Plane (SIP), or planes 3-16 (up to U+10FFFF); the result
+/// is therefore not uniform over all code points. Surrogates (U+D800-U+DFFF) are never
+/// generated (not valid Unicode scalar values), nor are U+0000-U+001F, U+FFFE, U+FFFF.
+///
+/// \param[in] seed Random seed for reproducibility
+/// \param[in] num_chars Number of Unicode code points to generate
+/// \return a generated UTF-8 encoded string
+ARROW_TESTING_EXPORT
+std::string RandomUtf8String(random::SeedType seed, int num_chars);
+
template <typename T, typename U>
void randint(int64_t N, T lower, T upper, std::vector<U>* out) {
const int random_seed = 0;