This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-io.git
The following commit(s) were added to refs/heads/master by this push: new dcd4f550 Refactor internals for better Unicode processing dcd4f550 is described below commit dcd4f5505e3aab7957029adaa3d2aa365ca9e612 Author: Gary Gregory <gardgreg...@gmail.com> AuthorDate: Tue Jun 14 18:06:10 2022 -0400 Refactor internals for better Unicode processing Add FileSystem.getIllegalFileNameCodePoints() --- src/changes/changes.xml | 3 + .../java/org/apache/commons/io/FileSystem.java | 199 +++++++++++---------- 2 files changed, 105 insertions(+), 97 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 31122869..733c273d 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -371,6 +371,9 @@ The <action> type attribute can be add,update,fix,remove. <action dev="ggregory" type="add" due-to="Gary Gregory"> Add PathUtils.touch(Path). </action> + <action dev="ggregory" type="add" due-to="Gary Gregory"> + Add FileSystem.getIllegalFileNameCodePoints(). + </action> <!-- UPDATE --> <action dev="kinow" type="update" due-to="Dependabot, Gary Gregory"> Bump actions/cache from 2.1.6 to 3.0.4 #307, #337. diff --git a/src/main/java/org/apache/commons/io/FileSystem.java b/src/main/java/org/apache/commons/io/FileSystem.java index 95f8bbe1..d42db03c 100644 --- a/src/main/java/org/apache/commons/io/FileSystem.java +++ b/src/main/java/org/apache/commons/io/FileSystem.java @@ -36,12 +36,12 @@ public enum FileSystem { /** * Generic file system. */ - GENERIC(false, false, Integer.MAX_VALUE, Integer.MAX_VALUE, new char[] { 0 }, new String[] {}, false, false, '/'), + GENERIC(false, false, Integer.MAX_VALUE, Integer.MAX_VALUE, new int[] { 0 }, new String[] {}, false, false, '/'), /** * Linux file system. */ - LINUX(true, true, 255, 4096, new char[] { + LINUX(true, true, 255, 4096, new int[] { // KEEP THIS ARRAY SORTED! // @formatter:off // ASCII NUL @@ -53,7 +53,7 @@ public enum FileSystem { /** * MacOS file system. 
*/ - MAC_OSX(true, true, 255, 1024, new char[] { + MAC_OSX(true, true, 255, 1024, new int[] { // KEEP THIS ARRAY SORTED! // @formatter:off // ASCII NUL @@ -77,7 +77,7 @@ public enum FileSystem { * CreateFileA function - Consoles (microsoft.com)</a> */ WINDOWS(false, true, 255, - 32000, new char[] { + 32000, new int[] { // KEEP THIS ARRAY SORTED! // @formatter:off // ASCII NUL @@ -193,6 +193,79 @@ public enum FileSystem { } } + /** + * Copied from Apache Commons Lang CharSequenceUtils. + * + * Returns the index within {@code cs} of the first occurrence of the + * specified character, starting the search at the specified index. + * <p> + * If a character with value {@code searchChar} occurs in the + * character sequence represented by the {@code cs} + * object at an index no smaller than {@code start}, then + * the index of the first such occurrence is returned. For values + * of {@code searchChar} in the range from 0 to 0xFFFF (inclusive), + * this is the smallest value <i>k</i> such that: + * </p> + * <blockquote><pre> + * (this.charAt(<i>k</i>) == searchChar) && (<i>k</i> >= start) + * </pre></blockquote> + * is true. For other values of {@code searchChar}, it is the + * smallest value <i>k</i> such that: + * <blockquote><pre> + * (this.codePointAt(<i>k</i>) == searchChar) && (<i>k</i> >= start) + * </pre></blockquote> + * <p> + * is true. In either case, if no such character occurs in {@code cs} + * at or after position {@code start}, then + * {@code -1} is returned. + * </p> + * <p> + * There is no restriction on the value of {@code start}. If it + * is negative, it has the same effect as if it were zero: the entire + * {@code CharSequence} may be searched. If it is greater than + * the length of {@code cs}, it has the same effect as if it were + * equal to the length of {@code cs}: {@code -1} is returned. + * </p> + * <p>All indices are specified in {@code char} values + * (Unicode code units). 
+ * </p> + * + * @param cs the {@code CharSequence} to be processed, not null + * @param searchChar the char to be searched for + * @param start the start index, negative starts at the string start + * @return the index where the search char was found, -1 if not found + * @since 3.6 updated to behave more like {@code String} + */ + private static int indexOf(final CharSequence cs, final int searchChar, int start) { + if (cs instanceof String) { + return ((String) cs).indexOf(searchChar, start); + } + final int sz = cs.length(); + if (start < 0) { + start = 0; + } + if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) { + for (int i = start; i < sz; i++) { + if (cs.charAt(i) == searchChar) { + return i; + } + } + return -1; + } + //supplementary characters (LANG1300) + if (searchChar <= Character.MAX_CODE_POINT) { + final char[] chars = Character.toChars(searchChar); + for (int i = start; i < sz - 1; i++) { + final char high = cs.charAt(i); + final char low = cs.charAt(i + 1); + if (high == chars[0] && low == chars[1]) { + return i; + } + } + } + return -1; + } + /** * Decides if the operating system matches. * <p> @@ -223,16 +296,16 @@ public enum FileSystem { private static String replace(final String path, final char oldChar, final char newChar) { return path == null ? null : path.replace(oldChar, newChar); } - private final boolean casePreserving; private final boolean caseSensitive; - private final char[] illegalFileNameChars; + private final int[] illegalFileNameChars; private final int maxFileNameLength; private final int maxPathLength; private final String[] reservedFileNames; private final boolean reservedFileNamesExtensions; private final boolean supportsDriveLetter; private final char nameSeparator; + private final char nameSeparatorOther; /** @@ -249,7 +322,7 @@ public enum FileSystem { * @param nameSeparator The name separator, '\\' on Windows, '/' on Linux. 
*/ FileSystem(final boolean caseSensitive, final boolean casePreserving, final int maxFileLength, - final int maxPathLength, final char[] illegalFileNameChars, final String[] reservedFileNames, + final int maxPathLength, final int[] illegalFileNameChars, final String[] reservedFileNames, final boolean reservedFileNamesExtensions, final boolean supportsDriveLetter, final char nameSeparator) { this.maxFileNameLength = maxFileLength; this.maxPathLength = maxPathLength; @@ -269,6 +342,20 @@ public enum FileSystem { * @return the illegal characters for this file system. */ public char[] getIllegalFileNameChars() { + final char[] chars = new char[illegalFileNameChars.length]; + for (int i = 0; i < illegalFileNameChars.length; i++) { + chars[i] = (char) illegalFileNameChars[i]; + } + return chars; + } + + /** + * Gets a cloned copy of the illegal code points for this file system. + * + * @return the illegal code points for this file system. + * @since 2.12.0 + */ + public int[] getIllegalFileNameCodePoints() { return this.illegalFileNameChars.clone(); } @@ -335,7 +422,7 @@ public enum FileSystem { * the character to test * @return {@code true} if the given character is illegal in a file name, {@code false} otherwise. 
*/ - private boolean isIllegalFileNameChar(final char c) { + private boolean isIllegalFileNameChar(final int c) { return Arrays.binarySearch(illegalFileNameChars, c) >= 0; } @@ -355,7 +442,7 @@ public enum FileSystem { if (isReservedFileName(candidate)) { return false; } - return candidate.chars().noneMatch(i -> isIllegalFileNameChar((char) i)); + return candidate.chars().noneMatch(this::isIllegalFileNameChar); } /** @@ -411,95 +498,13 @@ public enum FileSystem { */ public String toLegalFileName(final String candidate, final char replacement) { if (isIllegalFileNameChar(replacement)) { - throw new IllegalArgumentException( - String.format("The replacement character '%s' cannot be one of the %s illegal characters: %s", - // %s does not work properly with NUL - replacement == '\0' ? "\\0" : replacement, name(), Arrays.toString(illegalFileNameChars))); - } - final String truncated = candidate.length() > maxFileNameLength ? candidate.substring(0, maxFileNameLength) - : candidate; - boolean changed = false; - final char[] charArray = truncated.toCharArray(); - for (int i = 0; i < charArray.length; i++) { - if (isIllegalFileNameChar(charArray[i])) { - charArray[i] = replacement; - changed = true; - } - } - return changed ? String.valueOf(charArray) : truncated; - } - - /** - * Copied from Apache Commons Lang CharSequenceUtils. - * - * Returns the index within {@code cs} of the first occurrence of the - * specified character, starting the search at the specified index. - * <p> - * If a character with value {@code searchChar} occurs in the - * character sequence represented by the {@code cs} - * object at an index no smaller than {@code start}, then - * the index of the first such occurrence is returned. For values - * of {@code searchChar} in the range from 0 to 0xFFFF (inclusive), - * this is the smallest value <i>k</i> such that: - * </p> - * <blockquote><pre> - * (this.charAt(<i>k</i>) == searchChar) && (<i>k</i> >= start) - * </pre></blockquote> - * is true. 
For other values of {@code searchChar}, it is the - * smallest value <i>k</i> such that: - * <blockquote><pre> - * (this.codePointAt(<i>k</i>) == searchChar) && (<i>k</i> >= start) - * </pre></blockquote> - * <p> - * is true. In either case, if no such character occurs inm {@code cs} - * at or after position {@code start}, then - * {@code -1} is returned. - * </p> - * <p> - * There is no restriction on the value of {@code start}. If it - * is negative, it has the same effect as if it were zero: the entire - * {@code CharSequence} may be searched. If it is greater than - * the length of {@code cs}, it has the same effect as if it were - * equal to the length of {@code cs}: {@code -1} is returned. - * </p> - * <p>All indices are specified in {@code char} values - * (Unicode code units). - * </p> - * - * @param cs the {@code CharSequence} to be processed, not null - * @param searchChar the char to be searched for - * @param start the start index, negative starts at the string start - * @return the index where the search char was found, -1 if not found - * @since 3.6 updated to behave more like {@code String} - */ - private static int indexOf(final CharSequence cs, final int searchChar, int start) { - if (cs instanceof String) { - return ((String) cs).indexOf(searchChar, start); + // %s does not work properly with NUL + throw new IllegalArgumentException(String.format("The replacement character '%s' cannot be one of the %s illegal characters: %s", + replacement == '\0' ? 
"\\0" : replacement, name(), Arrays.toString(illegalFileNameChars))); } - final int sz = cs.length(); - if (start < 0) { - start = 0; - } - if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) { - for (int i = start; i < sz; i++) { - if (cs.charAt(i) == searchChar) { - return i; - } - } - return -1; - } - //supplementary characters (LANG1300) - if (searchChar <= Character.MAX_CODE_POINT) { - final char[] chars = Character.toChars(searchChar); - for (int i = start; i < sz - 1; i++) { - final char high = cs.charAt(i); - final char low = cs.charAt(i + 1); - if (high == chars[0] && low == chars[1]) { - return i; - } - } - } - return -1; + final String truncated = candidate.length() > maxFileNameLength ? candidate.substring(0, maxFileNameLength) : candidate; + final int[] array = truncated.chars().map(i -> isIllegalFileNameChar(i) ? replacement : i).toArray(); + return new String(array, 0, array.length); } CharSequence trimExtension(final CharSequence cs) {