This is an automated email from the ASF dual-hosted git repository.

ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-lang.git


The following commit(s) were added to refs/heads/master by this push:
     new fd866a9c2 Fix handling of non-ASCII letters & numbers in 
RandomStringUtils (#1273)
fd866a9c2 is described below

commit fd866a9c2d6c69f9f6462a5e498b58293597d611
Author: Fabrice Benhamouda <1146316+fabrice...@users.noreply.github.com>
AuthorDate: Tue Sep 10 20:47:24 2024 -0400

    Fix handling of non-ASCII letters & numbers in RandomStringUtils (#1273)
    
    An optimization introduced in commit
    f382d61a03778ccf838c6c051bd8692e4834dec2
    incorrectly assumed that letters and numbers/digits are ASCII.
    
    For example, `RandomStringUtils.random(1, 0x4e00, 0x4e01, true, false)`
    would take too long to terminate and would not output the correct
    answer, that is the string with a single character `0x4e00`.
    The issue is that `end` would be replaced by `'z' + 1 = 123` (the latest
    ASCII letter + 1) there: 
https://github.com/apache/commons-lang/blob/f382d61a03778ccf838c6c051bd8692e4834dec2/src/main/java/org/apache/commons/lang3/RandomStringUtils.java#L266-L270
     Thus, we would have `end < start` and `gap = end - start < 0`.
    
    This PR fixes this issue by restricting the optimization to cases
    where only ASCII characters are requested, that is `end < 0x7f`.
    This PR also slightly clarifies the code, using constant variables
    instead of '0', '9', 'A', and 'z'.
    
    The PR also includes two regression tests.
    
    Co-authored-by: Fabrice Benhamouda <yfabr...@amazon.com>
---
 .../apache/commons/lang3/RandomStringUtils.java    | 65 ++++++++++----------
 .../commons/lang3/RandomStringUtilsTest.java       | 70 ++++++++++++++++++++++
 2 files changed, 105 insertions(+), 30 deletions(-)

diff --git a/src/main/java/org/apache/commons/lang3/RandomStringUtils.java 
b/src/main/java/org/apache/commons/lang3/RandomStringUtils.java
index ca2f4a107..573f9ae5e 100644
--- a/src/main/java/org/apache/commons/lang3/RandomStringUtils.java
+++ b/src/main/java/org/apache/commons/lang3/RandomStringUtils.java
@@ -277,45 +277,50 @@ public class RandomStringUtils {
             end = Character.MAX_CODE_POINT;
         }
 
-        // Optimize generation of full alphanumerical characters
-        // Normally, we would need to pick a 7-bit integer, since gap = 'z' - 
'0' + 1 = 75 > 64
-        // In turn, this would make us reject the sampling with probability 1 
- 62 / 2^7 > 1 / 2
-        // Instead we can pick directly from the right set of 62 characters, 
which requires
-        // picking a 6-bit integer and only rejecting with probability 2 / 64 
= 1 / 32
-        if (chars == null && letters && numbers && start <= '0' && end >= 'z' 
+ 1) {
-            return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, 
random);
-        }
+        // Optimizations and tests when chars == null and using ASCII 
characters (end <= 0x7f)
+        if (chars == null && end <= 0x7f) {
+            final int zeroDigitAscii = 48; // '0'
+            final int lastDigitAscii = 57; // '9'
+            final int firstLetterAscii = 65; // 'A'
+            final int lastLetterAscii = 122; // 'z'
+
+            // Optimize generation of full alphanumerical characters
+            // Normally, we would need to pick a 7-bit integer, since gap = 
'z' - '0' + 1 = 75 > 64
+            // In turn, this would make us reject the sampling with 
probability 1 - 62 / 2^7 > 1 / 2
+            // Instead we can pick directly from the right set of 62 
characters, which requires
+            // picking a 6-bit integer and only rejecting with probability 2 / 
64 = 1 / 32
+            if (letters && numbers && start <= zeroDigitAscii && end >= 
lastLetterAscii + 1) {
+                return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, 
random);
+            }
 
-        // Optimize start and end when filtering by letters and/or numbers:
-        // The range provided may be too large since we filter anyway 
afterward.
-        // Note the use of Math.min/max (as opposed to setting start to '0' 
for example),
-        // since it is possible the range start/end excludes some of the 
letters/numbers,
-        // e.g., it is possible that start already is '1' when numbers = true, 
and start
-        // needs to stay equal to '1' in that case.
-        if (chars == null) {
+            if (numbers && end <= zeroDigitAscii || letters && end <= 
firstLetterAscii) {
+                throw new IllegalArgumentException(
+                        "Parameter end (" + end + ") must be greater then (" + 
zeroDigitAscii + ") for generating digits "
+                                + "or greater then (" + firstLetterAscii + ") 
for generating letters.");
+            }
+
+            // Optimize start and end when filtering by letters and/or numbers:
+            // The range provided may be too large since we filter anyway 
afterward.
+            // Note the use of Math.min/max (as opposed to setting start to 
'0' for example),
+            // since it is possible the range start/end excludes some of the 
letters/numbers,
+            // e.g., it is possible that start already is '1' when numbers = 
true, and start
+            // needs to stay equal to '1' in that case.
+            // Note that because of the above test, we will always have start 
< end
+            // even after this optimization.
             if (letters && numbers) {
-                start = Math.max('0', start);
-                end = Math.min('z' + 1, end);
+                start = Math.max(zeroDigitAscii, start);
+                end = Math.min(lastLetterAscii + 1, end);
             } else if (numbers) {
                 // just numbers, no letters
-                start = Math.max('0', start);
-                end = Math.min('9' + 1, end);
+                start = Math.max(zeroDigitAscii, start);
+                end = Math.min(lastDigitAscii + 1, end);
             } else if (letters) {
                 // just letters, no numbers
-                start = Math.max('A', start);
-                end = Math.min('z' + 1, end);
+                start = Math.max(firstLetterAscii, start);
+                end = Math.min(lastLetterAscii + 1, end);
             }
         }
 
-        final int zeroDigitAscii = 48;
-        final int firstLetterAscii = 65;
-
-        if (chars == null && (numbers && end <= zeroDigitAscii || letters && 
end <= firstLetterAscii)) {
-            throw new IllegalArgumentException(
-                    "Parameter end (" + end + ") must be greater then (" + 
zeroDigitAscii + ") for generating digits "
-                            + "or greater then (" + firstLetterAscii + ") for 
generating letters.");
-        }
-
         final StringBuilder builder = new StringBuilder(count);
         final int gap = end - start;
         final int gapBits = Integer.SIZE - Integer.numberOfLeadingZeros(gap);
diff --git a/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java 
b/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java
index 4250f7b1b..2fb68e299 100644
--- a/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java
+++ b/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java
@@ -732,4 +732,74 @@ public class RandomStringUtilsTest extends 
AbstractLangTest {
         assertNotEquals(r1, r3);
         assertNotEquals(r2, r3);
     }
+
+    /**
+     * Test {@code RandomStringUtils.random} works appropriately when 
letters=true
+     * and the range does not only include ASCII letters.
+     * Fails with probability less than 2^-40 (in practice this never happens).
+     */
+    @ParameterizedTest
+    @MethodSource("randomProvider")
+    void testNonASCIILetters(final RandomStringUtils rsu) {
+        // Check that the following create a string with 10 characters 0x4e00 
(a non-ASCII letter)
+        String r1 = rsu.next(10, 0x4e00, 0x4e01, true, false);
+        assertEquals(10, r1.length(), "wrong length");
+        for (int i = 0; i < r1.length(); i++) {
+            assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 
0x4e00");
+        }
+
+        // Same with both letters=true and numbers=true
+        r1 = rsu.next(10, 0x4e00, 0x4e01, true, true);
+        assertEquals(10, r1.length(), "wrong length");
+        for (int i = 0; i < r1.length(); i++) {
+            assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 
0x4e00");
+        }
+
+        // Check that at least one letter is not ASCII
+        boolean found = false;
+        r1 = rsu.next(40, 'F', 0x3000, true, false);
+        assertEquals(40, r1.length(), "wrong length");
+        for (int i = 0; i < r1.length(); i++) {
+            assertTrue(Character.isLetter(r1.charAt(i)), "characters not all 
letters");
+            if (r1.charAt(i) > 0x7f) {
+                found = true;
+            }
+        }
+        assertTrue(found, "no non-ASCII letter generated");
+    }
+
+    /**
+     * Test {@code RandomStringUtils.random} works appropriately when 
numbers=true
+     * and the range does not only include ASCII numbers/digits.
+     * Fails with probability less than 2^-40 (in practice this never happens).
+     */
+    @ParameterizedTest
+    @MethodSource("randomProvider")
+    void testNonASCIINumbers(final RandomStringUtils rsu) {
+        // Check that the following create a string with 10 characters 0x0660 
(a non-ASCII digit)
+        String r1 = rsu.next(10, 0x0660, 0x0661, false, true);
+        assertEquals(10, r1.length(), "wrong length");
+        for (int i = 0; i < r1.length(); i++) {
+            assertEquals(0x0660, r1.charAt(i), "characters not all equal to 
0x0660");
+        }
+
+        // Same with both letters=true and numbers=true
+        r1 = rsu.next(10, 0x0660, 0x0661, true, true);
+        assertEquals(10, r1.length(), "wrong length");
+        for (int i = 0; i < r1.length(); i++) {
+            assertEquals(0x0660, r1.charAt(i), "characters not all equal to 
0x0660");
+        }
+
+        // Check that at least one letter is not ASCII
+        boolean found = false;
+        r1 = rsu.next(40, 'F', 0x3000, false, true);
+        assertEquals(40, r1.length(), "wrong length");
+        for (int i = 0; i < r1.length(); i++) {
+            assertTrue(Character.isDigit(r1.charAt(i)), "characters not all 
numbers");
+            if (r1.charAt(i) > 0x7f) {
+                found = true;
+            }
+        }
+        assertTrue(found, "no non-ASCII number generated");
+    }
 }

Reply via email to