This is an automated email from the ASF dual-hosted git repository.

alenka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 233590474f GH-48448: [Python] Implement Alphanumeric and Surrogate text in the random schema generator (#48449)
233590474f is described below

commit 233590474f0306fbf16f1c282bdae517a482c579
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Tue Jan 6 17:24:54 2026 +0900

    GH-48448: [Python] Implement Alphanumeric and Surrogate text in the random schema generator (#48449)
    
    ### Rationale for this change
    
    To test a wider variety of field names, as requested by the TODO at:
    
    https://github.com/apache/arrow/blob/6456944f5092dedb3f80d9bc80400e857d6571c7/python/pyarrow/tests/strategies.py#L49
    
    The TODO on that line was introduced in https://github.com/apache/arrow/commit/9da458437162574f3e0d82e4a51dc6c1589b9f94
    
    ### What changes are included in this PR?
    
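    This PR implements the Alphanumeric (`alphanum_text`) and Surrogate (`surrogate_text`) text strategies in the random schema generator and uses them, alongside `custom_text`, to generate field names in `all_fields`.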
    
    ### Are these changes tested?
    
    Yes, I tested them via:
    
    ```
    PYARROW_TEST_HYPOTHESIS=1 pytest -xvs pyarrow/tests/test_strategies.py::test_fields --hypothesis-show-statistics
    ```
    
    ### Are there any user-facing changes?
    
    No, test-only.
    * GitHub Issue: #48448
    
    Authored-by: Hyukjin Kwon <[email protected]>
    Signed-off-by: AlenkaF <[email protected]>
---
 python/pyarrow/tests/strategies.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 8319c9ce3e..3c31650ddf 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -46,7 +46,7 @@ except ImportError:
 import pyarrow as pa
 
 
-# TODO(kszucs): alphanum_text, surrogate_text
+# Text generation strategies for various character sets
 custom_text = st.text(
     alphabet=st.characters(
         min_codepoint=0x41,
@@ -54,6 +54,23 @@ custom_text = st.text(
     )
 )
 
+# alphanum_text: Only alphanumeric characters (a-z, A-Z, 0-9)
+alphanum_text = st.text(
+    alphabet=st.characters(
+        whitelist_categories=('Ll', 'Lu', 'Nd'),  # Lowercase, Uppercase, Decimal Number
+        min_codepoint=0x30,  # Start from '0' (U+0030)
+        max_codepoint=0x7A   # End at 'z' (U+007A)
+    )
+)
+
+# surrogate_text: Unicode supplementary planes (U+10000 to U+10FFFF)
+surrogate_text = st.text(
+    alphabet=st.characters(
+        min_codepoint=0x10000,  # Start of Plane 1 (Supplementary Multilingual Plane)
+        max_codepoint=0x10FFFF  # End of valid Unicode range (last code point)
+    )
+)
+
 null_type = st.just(pa.null())
 bool_type = st.just(pa.bool_())
 
@@ -164,8 +181,10 @@ metadata = st.dictionaries(st.text(), st.text())
 
 
 @st.composite
-def fields(draw, type_strategy=primitive_types):
-    name = draw(custom_text)
+def fields(draw, type_strategy=primitive_types, name_strategy=None):
+    if name_strategy is None:
+        name_strategy = custom_text
+    name = draw(name_strategy)
     typ = draw(type_strategy)
     if pa.types.is_null(typ):
         nullable = True
@@ -243,7 +262,11 @@ all_types = st.deferred(
         struct_types(all_types)
     )
 )
-all_fields = fields(all_types)
+all_fields = st.one_of(
+    fields(all_types),  # custom_text
+    fields(all_types, name_strategy=alphanum_text),
+    fields(all_types, name_strategy=surrogate_text)
+)
 all_schemas = schemas(all_types)
 
 

Reply via email to