This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-lang.git
The following commit(s) were added to refs/heads/master by this push: new fd866a9c2 Fix handling of non-ASCII letters & numbers in RandomStringUtils (#1273) fd866a9c2 is described below commit fd866a9c2d6c69f9f6462a5e498b58293597d611 Author: Fabrice Benhamouda <1146316+fabrice...@users.noreply.github.com> AuthorDate: Tue Sep 10 20:47:24 2024 -0400 Fix handling of non-ASCII letters & numbers in RandomStringUtils (#1273) An optimization introduced in commit f382d61a03778ccf838c6c051bd8692e4834dec2 incorrectly assumed that letters and numbers/digits are ASCII. For example, `RandomStringUtils.random(1, 0x4e00, 0x4e01, true, false)` would take too long to terminate and would not output the correct answer, that is the string with a single character `0x4e00`. The issue is that `end` would be replaced by `'z' + 1 = 123` (the latest ASCII letter + 1) there: https://github.com/apache/commons-lang/blob/f382d61a03778ccf838c6c051bd8692e4834dec2/src/main/java/org/apache/commons/lang3/RandomStringUtils.java#L266-L270 Thus, we would have `end < start` and `gap = end - start < 0`. This PR fixes this issue by restricting the optimization to cases where only ASCII characters are requested, that is `end < 0x7f`. This PR also slightly clarifies the code, using constant variables instead of '0', '9', 'A', and 'z'. The PR also includes two regression tests. Co-authored-by: Fabrice Benhamouda <yfabr...@amazon.com> --- .../apache/commons/lang3/RandomStringUtils.java | 65 ++++++++++---------- .../commons/lang3/RandomStringUtilsTest.java | 70 ++++++++++++++++++++++ 2 files changed, 105 insertions(+), 30 deletions(-) diff --git a/src/main/java/org/apache/commons/lang3/RandomStringUtils.java b/src/main/java/org/apache/commons/lang3/RandomStringUtils.java index ca2f4a107..573f9ae5e 100644 --- a/src/main/java/org/apache/commons/lang3/RandomStringUtils.java +++ b/src/main/java/org/apache/commons/lang3/RandomStringUtils.java @@ -277,45 +277,50 @@ public class RandomStringUtils { end = Character.MAX_CODE_POINT; } - // Optimize generation of full alphanumerical characters - // Normally, we would need to pick a 7-bit integer, since gap = 'z' - '0' + 1 = 75 > 64 - // In turn, this would make us reject the sampling with probability 1 - 62 / 2^7 > 1 / 2 - // Instead we can pick directly from the right set of 62 characters, which requires - // picking a 6-bit integer and only rejecting with probability 2 / 64 = 1 / 32 - if (chars == null && letters && numbers && start <= '0' && end >= 'z' + 1) { - return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, random); - } + // Optimizations and tests when chars == null and using ASCII characters (end <= 0x7f) + if (chars == null && end <= 0x7f) { + final int zeroDigitAscii = 48; // '0' + final int lastDigitAscii = 57; // '9' + final int firstLetterAscii = 65; // 'A' + final int lastLetterAscii = 122; // 'z' + + // Optimize generation of full alphanumerical characters + // Normally, we would need to pick a 7-bit integer, since gap = 'z' - '0' + 1 = 75 > 64 + // In turn, this would make us reject the sampling with probability 1 - 62 / 2^7 > 1 / 2 + // Instead we can pick directly from the right set of 62 characters, which requires + // picking a 6-bit integer and only rejecting with probability 2 / 64 = 1 / 32 + if (letters && numbers && start <= zeroDigitAscii && end >= lastLetterAscii + 1) { + return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, random); + } - // Optimize start and end when filtering by letters and/or numbers: - // The range provided may be too large since we filter anyway afterward. - // Note the use of Math.min/max (as opposed to setting start to '0' for example), - // since it is possible the range start/end excludes some of the letters/numbers, - // e.g., it is possible that start already is '1' when numbers = true, and start - // needs to stay equal to '1' in that case. - if (chars == null) { + if (numbers && end <= zeroDigitAscii || letters && end <= firstLetterAscii) { + throw new IllegalArgumentException( + "Parameter end (" + end + ") must be greater then (" + zeroDigitAscii + ") for generating digits " + + "or greater then (" + firstLetterAscii + ") for generating letters."); + } + + // Optimize start and end when filtering by letters and/or numbers: + // The range provided may be too large since we filter anyway afterward. + // Note the use of Math.min/max (as opposed to setting start to '0' for example), + // since it is possible the range start/end excludes some of the letters/numbers, + // e.g., it is possible that start already is '1' when numbers = true, and start + // needs to stay equal to '1' in that case. + // Note that because of the above test, we will always have start < end + // even after this optimization. if (letters && numbers) { - start = Math.max('0', start); - end = Math.min('z' + 1, end); + start = Math.max(zeroDigitAscii, start); + end = Math.min(lastLetterAscii + 1, end); } else if (numbers) { // just numbers, no letters - start = Math.max('0', start); - end = Math.min('9' + 1, end); + start = Math.max(zeroDigitAscii, start); + end = Math.min(lastDigitAscii + 1, end); } else if (letters) { // just letters, no numbers - start = Math.max('A', start); - end = Math.min('z' + 1, end); + start = Math.max(firstLetterAscii, start); + end = Math.min(lastLetterAscii + 1, end); } } - final int zeroDigitAscii = 48; - final int firstLetterAscii = 65; - - if (chars == null && (numbers && end <= zeroDigitAscii || letters && end <= firstLetterAscii)) { - throw new IllegalArgumentException( - "Parameter end (" + end + ") must be greater then (" + zeroDigitAscii + ") for generating digits " - + "or greater then (" + firstLetterAscii + ") for generating letters."); - } - final StringBuilder builder = new StringBuilder(count); final int gap = end - start; final int gapBits = Integer.SIZE - Integer.numberOfLeadingZeros(gap); diff --git a/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java b/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java index 4250f7b1b..2fb68e299 100644 --- a/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java @@ -732,4 +732,74 @@ public class RandomStringUtilsTest extends AbstractLangTest { assertNotEquals(r1, r3); assertNotEquals(r2, r3); } + + /** + * Test {@code RandomStringUtils.random} works appropriately when letters=true + * and the range does not only include ASCII letters. + * Fails with probability less than 2^-40 (in practice this never happens). + */ + @ParameterizedTest + @MethodSource("randomProvider") + void testNonASCIILetters(final RandomStringUtils rsu) { + // Check that the following create a string with 10 characters 0x4e00 (a non-ASCII letter) + String r1 = rsu.next(10, 0x4e00, 0x4e01, true, false); + assertEquals(10, r1.length(), "wrong length"); + for (int i = 0; i < r1.length(); i++) { + assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 0x4e00"); + } + + // Same with both letters=true and numbers=true + r1 = rsu.next(10, 0x4e00, 0x4e01, true, true); + assertEquals(10, r1.length(), "wrong length"); + for (int i = 0; i < r1.length(); i++) { + assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 0x4e00"); + } + + // Check that at least one letter is not ASCII + boolean found = false; + r1 = rsu.next(40, 'F', 0x3000, true, false); + assertEquals(40, r1.length(), "wrong length"); + for (int i = 0; i < r1.length(); i++) { + assertTrue(Character.isLetter(r1.charAt(i)), "characters not all letters"); + if (r1.charAt(i) > 0x7f) { + found = true; + } + } + assertTrue(found, "no non-ASCII letter generated"); + } + + /** + * Test {@code RandomStringUtils.random} works appropriately when numbers=true + * and the range does not only include ASCII numbers/digits. + * Fails with probability less than 2^-40 (in practice this never happens). + */ + @ParameterizedTest + @MethodSource("randomProvider") + void testNonASCIINumbers(final RandomStringUtils rsu) { + // Check that the following create a string with 10 characters 0x0660 (a non-ASCII digit) + String r1 = rsu.next(10, 0x0660, 0x0661, false, true); + assertEquals(10, r1.length(), "wrong length"); + for (int i = 0; i < r1.length(); i++) { + assertEquals(0x0660, r1.charAt(i), "characters not all equal to 0x0660"); + } + + // Same with both letters=true and numbers=true + r1 = rsu.next(10, 0x0660, 0x0661, true, true); + assertEquals(10, r1.length(), "wrong length"); + for (int i = 0; i < r1.length(); i++) { + assertEquals(0x0660, r1.charAt(i), "characters not all equal to 0x0660"); + } + + // Check that at least one letter is not ASCII + boolean found = false; + r1 = rsu.next(40, 'F', 0x3000, false, true); + assertEquals(40, r1.length(), "wrong length"); + for (int i = 0; i < r1.length(); i++) { + assertTrue(Character.isDigit(r1.charAt(i)), "characters not all numbers"); + if (r1.charAt(i) > 0x7f) { + found = true; + } + } + assertTrue(found, "no non-ASCII number generated"); + } }