http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/StrTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/StrTokenizer.java b/src/main/java/org/apache/commons/text/StrTokenizer.java new file mode 100644 index 0000000..8186f37 --- /dev/null +++ b/src/main/java/org/apache/commons/text/StrTokenizer.java @@ -0,0 +1,1118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.ListIterator; +import java.util.NoSuchElementException; + +/** + * Tokenizes a string based based on delimiters (separators) + * and supporting quoting and ignored character concepts. + * <p> + * This class can split a String into many smaller strings. It aims + * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, + * however it offers much more control and flexibility including implementing + * the <code>ListIterator</code> interface. By default, it is set up + * like <code>StringTokenizer</code>. 
+ * <p> + * The input String is split into a number of <i>tokens</i>. + * Each token is separated from the next String by a <i>delimiter</i>. + * One or more delimiter characters must be specified. + * <p> + * Each token may be surrounded by quotes. + * The <i>quote</i> matcher specifies the quote character(s). + * A quote may be escaped within a quoted section by duplicating itself. + * <p> + * Between each token and the delimiter are potentially characters that need trimming. + * The <i>trimmer</i> matcher specifies these characters. + * One usage might be to trim whitespace characters. + * <p> + * At any point outside the quotes there might potentially be invalid characters. + * The <i>ignored</i> matcher specifies these characters to be removed. + * One usage might be to remove new line characters. + * <p> + * Empty tokens may be removed or returned as null. + * <pre> + * "a,b,c" - Three tokens "a","b","c" (comma delimiter) + * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) + * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) + * </pre> + * <p> + * + * This tokenizer has the following properties and options: + * + * <table summary="Tokenizer Properties"> + * <tr> + * <th>Property</th><th>Type</th><th>Default</th> + * </tr> + * <tr> + * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> + * </tr> + * <tr> + * <td>quote</td><td>NoneMatcher</td><td>{}</td> + * </tr> + * <tr> + * <td>ignore</td><td>NoneMatcher</td><td>{}</td> + * </tr> + * <tr> + * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> + * </tr> + * <tr> + * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> + * </tr> + * </table> + * + * @since 1.0 + */ +public class StrTokenizer implements ListIterator<String>, Cloneable { + + private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE; + private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE; + static { + CSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); + 
CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher()); + CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); + CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); + CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); + CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); + CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); + + TSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); + TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher()); + TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); + TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); + TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); + TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); + TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); + } + + /** The text to work on. */ + private char chars[]; + /** The parsed tokens */ + private String tokens[]; + /** The current iteration position */ + private int tokenPos; + + /** The delimiter matcher */ + private StrMatcher delimMatcher = StrMatcher.splitMatcher(); + /** The quote matcher */ + private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); + /** The ignored matcher */ + private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); + /** The trimmer matcher */ + private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); + + /** Whether to return empty tokens as null */ + private boolean emptyAsNull = false; + /** Whether to ignore empty tokens */ + private boolean ignoreEmptyTokens = true; + + //----------------------------------------------------------------------- + + /** + * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>. + * + * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>. 
+ */ + private static StrTokenizer getCSVClone() { + return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); + } + + /** + * Gets a new tokenizer instance which parses Comma Separated Value strings + * initializing it with the given input. The default for CSV processing + * will be trim whitespace from both ends (which can be overridden with + * the setTrimmer method). + * <p> + * You must call a "reset" method to set the string which you want to parse. + * @return a new tokenizer instance which parses Comma Separated Value strings + */ + public static StrTokenizer getCSVInstance() { + return getCSVClone(); + } + + /** + * Gets a new tokenizer instance which parses Comma Separated Value strings + * initializing it with the given input. The default for CSV processing + * will be trim whitespace from both ends (which can be overridden with + * the setTrimmer method). + * + * @param input the text to parse + * @return a new tokenizer instance which parses Comma Separated Value strings + */ + public static StrTokenizer getCSVInstance(final String input) { + final StrTokenizer tok = getCSVClone(); + tok.reset(input); + return tok; + } + + /** + * Gets a new tokenizer instance which parses Comma Separated Value strings + * initializing it with the given input. The default for CSV processing + * will be trim whitespace from both ends (which can be overridden with + * the setTrimmer method). + * + * @param input the text to parse + * @return a new tokenizer instance which parses Comma Separated Value strings + */ + public static StrTokenizer getCSVInstance(final char[] input) { + final StrTokenizer tok = getCSVClone(); + tok.reset(input); + return tok; + } + + /** + * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>. + * + * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>. 
+ */ + private static StrTokenizer getTSVClone() { + return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); + } + + + /** + * Gets a new tokenizer instance which parses Tab Separated Value strings. + * The default for CSV processing will be trim whitespace from both ends + * (which can be overridden with the setTrimmer method). + * <p> + * You must call a "reset" method to set the string which you want to parse. + * @return a new tokenizer instance which parses Tab Separated Value strings. + */ + public static StrTokenizer getTSVInstance() { + return getTSVClone(); + } + + /** + * Gets a new tokenizer instance which parses Tab Separated Value strings. + * The default for CSV processing will be trim whitespace from both ends + * (which can be overridden with the setTrimmer method). + * @param input the string to parse + * @return a new tokenizer instance which parses Tab Separated Value strings. + */ + public static StrTokenizer getTSVInstance(final String input) { + final StrTokenizer tok = getTSVClone(); + tok.reset(input); + return tok; + } + + /** + * Gets a new tokenizer instance which parses Tab Separated Value strings. + * The default for CSV processing will be trim whitespace from both ends + * (which can be overridden with the setTrimmer method). + * @param input the string to parse + * @return a new tokenizer instance which parses Tab Separated Value strings. + */ + public static StrTokenizer getTSVInstance(final char[] input) { + final StrTokenizer tok = getTSVClone(); + tok.reset(input); + return tok; + } + + //----------------------------------------------------------------------- + /** + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer, but with no text to tokenize. + * <p> + * This constructor is normally used with {@link #reset(String)}. 
+ */ + public StrTokenizer() { + super(); + this.chars = null; + } + + /** + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. + * + * @param input the string which is to be parsed + */ + public StrTokenizer(final String input) { + super(); + if (input != null) { + chars = input.toCharArray(); + } else { + chars = null; + } + } + + /** + * Constructs a tokenizer splitting on the specified delimiter character. + * + * @param input the string which is to be parsed + * @param delim the field delimiter character + */ + public StrTokenizer(final String input, final char delim) { + this(input); + setDelimiterChar(delim); + } + + /** + * Constructs a tokenizer splitting on the specified delimiter string. + * + * @param input the string which is to be parsed + * @param delim the field delimiter string + */ + public StrTokenizer(final String input, final String delim) { + this(input); + setDelimiterString(delim); + } + + /** + * Constructs a tokenizer splitting using the specified delimiter matcher. + * + * @param input the string which is to be parsed + * @param delim the field delimiter matcher + */ + public StrTokenizer(final String input, final StrMatcher delim) { + this(input); + setDelimiterMatcher(delim); + } + + /** + * Constructs a tokenizer splitting on the specified delimiter character + * and handling quotes using the specified quote character. + * + * @param input the string which is to be parsed + * @param delim the field delimiter character + * @param quote the field quoted string character + */ + public StrTokenizer(final String input, final char delim, final char quote) { + this(input, delim); + setQuoteChar(quote); + } + + /** + * Constructs a tokenizer splitting using the specified delimiter matcher + * and handling quotes using the specified quote matcher. 
+ * + * @param input the string which is to be parsed + * @param delim the field delimiter matcher + * @param quote the field quoted string matcher + */ + public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { + this(input, delim); + setQuoteMatcher(quote); + } + + /** + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. + * + * @param input the string which is to be parsed, not cloned + */ + public StrTokenizer(final char[] input) { + super(); + if (input == null) { + this.chars = null; + } else { + this.chars = input.clone(); + } + } + + /** + * Constructs a tokenizer splitting on the specified character. + * + * @param input the string which is to be parsed, not cloned + * @param delim the field delimiter character + */ + public StrTokenizer(final char[] input, final char delim) { + this(input); + setDelimiterChar(delim); + } + + /** + * Constructs a tokenizer splitting on the specified string. + * + * @param input the string which is to be parsed, not cloned + * @param delim the field delimiter string + */ + public StrTokenizer(final char[] input, final String delim) { + this(input); + setDelimiterString(delim); + } + + /** + * Constructs a tokenizer splitting using the specified delimiter matcher. + * + * @param input the string which is to be parsed, not cloned + * @param delim the field delimiter matcher + */ + public StrTokenizer(final char[] input, final StrMatcher delim) { + this(input); + setDelimiterMatcher(delim); + } + + /** + * Constructs a tokenizer splitting on the specified delimiter character + * and handling quotes using the specified quote character. 
+ * + * @param input the string which is to be parsed, not cloned + * @param delim the field delimiter character + * @param quote the field quoted string character + */ + public StrTokenizer(final char[] input, final char delim, final char quote) { + this(input, delim); + setQuoteChar(quote); + } + + /** + * Constructs a tokenizer splitting using the specified delimiter matcher + * and handling quotes using the specified quote matcher. + * + * @param input the string which is to be parsed, not cloned + * @param delim the field delimiter character + * @param quote the field quoted string character + */ + public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { + this(input, delim); + setQuoteMatcher(quote); + } + + // API + //----------------------------------------------------------------------- + /** + * Gets the number of tokens found in the String. + * + * @return the number of matched tokens + */ + public int size() { + checkTokenized(); + return tokens.length; + } + + /** + * Gets the next token from the String. + * Equivalent to {@link #next()} except it returns null rather than + * throwing {@link NoSuchElementException} when no tokens remain. + * + * @return the next sequential token, or null when no more tokens are found + */ + public String nextToken() { + if (hasNext()) { + return tokens[tokenPos++]; + } + return null; + } + + /** + * Gets the previous token from the String. + * + * @return the previous sequential token, or null when no more tokens are found + */ + public String previousToken() { + if (hasPrevious()) { + return tokens[--tokenPos]; + } + return null; + } + + /** + * Gets a copy of the full token list as an independent modifiable array. + * + * @return the tokens as a String array + */ + public String[] getTokenArray() { + checkTokenized(); + return tokens.clone(); + } + + /** + * Gets a copy of the full token list as an independent modifiable list. 
+ * + * @return the tokens as a String array + */ + public List<String> getTokenList() { + checkTokenized(); + final List<String> list = new ArrayList<>(tokens.length); + for (final String element : tokens) { + list.add(element); + } + return list; + } + + /** + * Resets this tokenizer, forgetting all parsing and iteration already completed. + * <p> + * This method allows the same tokenizer to be reused for the same String. + * + * @return this, to enable chaining + */ + public StrTokenizer reset() { + tokenPos = 0; + tokens = null; + return this; + } + + /** + * Reset this tokenizer, giving it a new input string to parse. + * In this manner you can re-use a tokenizer with the same settings + * on multiple input lines. + * + * @param input the new string to tokenize, null sets no text to parse + * @return this, to enable chaining + */ + public StrTokenizer reset(final String input) { + reset(); + if (input != null) { + this.chars = input.toCharArray(); + } else { + this.chars = null; + } + return this; + } + + /** + * Reset this tokenizer, giving it a new input string to parse. + * In this manner you can re-use a tokenizer with the same settings + * on multiple input lines. + * + * @param input the new character array to tokenize, not cloned, null sets no text to parse + * @return this, to enable chaining + */ + public StrTokenizer reset(final char[] input) { + reset(); + if (input != null) { + this.chars = input.clone(); + } else { + this.chars = null; + } + return this; + } + + // ListIterator + //----------------------------------------------------------------------- + /** + * Checks whether there are any more tokens. + * + * @return true if there are more tokens + */ + @Override + public boolean hasNext() { + checkTokenized(); + return tokenPos < tokens.length; + } + + /** + * Gets the next token. 
+ * + * @return the next String token + * @throws NoSuchElementException if there are no more elements + */ + @Override + public String next() { + if (hasNext()) { + return tokens[tokenPos++]; + } + throw new NoSuchElementException(); + } + + /** + * Gets the index of the next token to return. + * + * @return the next token index + */ + @Override + public int nextIndex() { + return tokenPos; + } + + /** + * Checks whether there are any previous tokens that can be iterated to. + * + * @return true if there are previous tokens + */ + @Override + public boolean hasPrevious() { + checkTokenized(); + return tokenPos > 0; + } + + /** + * Gets the token previous to the last returned token. + * + * @return the previous token + */ + @Override + public String previous() { + if (hasPrevious()) { + return tokens[--tokenPos]; + } + throw new NoSuchElementException(); + } + + /** + * Gets the index of the previous token. + * + * @return the previous token index + */ + @Override + public int previousIndex() { + return tokenPos - 1; + } + + /** + * Unsupported ListIterator operation. + * + * @throws UnsupportedOperationException always + */ + @Override + public void remove() { + throw new UnsupportedOperationException("remove() is unsupported"); + } + + /** + * Unsupported ListIterator operation. + * @param obj this parameter ignored. + * @throws UnsupportedOperationException always + */ + @Override + public void set(final String obj) { + throw new UnsupportedOperationException("set() is unsupported"); + } + + /** + * Unsupported ListIterator operation. + * @param obj this parameter ignored. + * @throws UnsupportedOperationException always + */ + @Override + public void add(final String obj) { + throw new UnsupportedOperationException("add() is unsupported"); + } + + // Implementation + //----------------------------------------------------------------------- + /** + * Checks if tokenization has been done, and if not then do it. 
+ */ + private void checkTokenized() { + if (tokens == null) { + if (chars == null) { + // still call tokenize as subclass may do some work + final List<String> split = tokenize(null, 0, 0); + tokens = split.toArray(new String[split.size()]); + } else { + final List<String> split = tokenize(chars, 0, chars.length); + tokens = split.toArray(new String[split.size()]); + } + } + } + + /** + * Internal method to performs the tokenization. + * <p> + * Most users of this class do not need to call this method. This method + * will be called automatically by other (public) methods when required. + * <p> + * This method exists to allow subclasses to add code before or after the + * tokenization. For example, a subclass could alter the character array, + * offset or count to be parsed, or call the tokenizer multiple times on + * multiple strings. It is also be possible to filter the results. + * <p> + * <code>StrTokenizer</code> will always pass a zero offset and a count + * equal to the length of the array to this method, however a subclass + * may pass other values, or even an entirely different array. 
+ * + * @param srcChars the character array being tokenized, may be null + * @param offset the start position within the character array, must be valid + * @param count the number of characters to tokenize, must be valid + * @return the modifiable list of String tokens, unmodifiable if null array or zero count + */ + protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { + if (srcChars == null || count == 0) { + return Collections.emptyList(); + } + final StrBuilder buf = new StrBuilder(); + final List<String> tokenList = new ArrayList<>(); + int pos = offset; + + // loop around the entire buffer + while (pos >= 0 && pos < count) { + // find next token + pos = readNextToken(srcChars, pos, count, buf, tokenList); + + // handle case where end of string is a delimiter + if (pos >= count) { + addToken(tokenList, ""); + } + } + return tokenList; + } + + /** + * Adds a token to a list, paying attention to the parameters we've set. + * + * @param list the list to add to + * @param tok the token to add + */ + private void addToken(final List<String> list, String tok) { + if (tok == null || tok.length() == 0) { + if (isIgnoreEmptyTokens()) { + return; + } + if (isEmptyTokenAsNull()) { + tok = null; + } + } + list.add(tok); + } + + /** + * Reads character by character through the String to get the next token. 
+ * + * @param srcChars the character array being tokenized + * @param start the first character of field + * @param len the length of the character array being tokenized + * @param workArea a temporary work area + * @param tokenList the list of parsed tokens + * @return the starting position of the next field (the character + * immediately after the delimiter), or -1 if end of string found + */ + private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) { + // skip all leading whitespace, unless it is the + // field delimiter or the quote character + while (start < len) { + final int removeLen = Math.max( + getIgnoredMatcher().isMatch(srcChars, start, start, len), + getTrimmerMatcher().isMatch(srcChars, start, start, len)); + if (removeLen == 0 || + getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 || + getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { + break; + } + start += removeLen; + } + + // handle reaching end + if (start >= len) { + addToken(tokenList, ""); + return -1; + } + + // handle empty token + final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); + if (delimLen > 0) { + addToken(tokenList, ""); + return start + delimLen; + } + + // handle found token + final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); + if (quoteLen > 0) { + return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); + } + return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); + } + + /** + * Reads a possibly quoted string token. 
+ * + * @param srcChars the character array being tokenized + * @param start the first character of field + * @param len the length of the character array being tokenized + * @param workArea a temporary work area + * @param tokenList the list of parsed tokens + * @param quoteStart the start position of the matched quote, 0 if no quoting + * @param quoteLen the length of the matched quote, 0 if no quoting + * @return the starting position of the next field (the character + * immediately after the delimiter, or if end of string found, + * then the length of string + */ + private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, + final List<String> tokenList, final int quoteStart, final int quoteLen) { + // Loop until we've found the end of the quoted + // string or the end of the input + workArea.clear(); + int pos = start; + boolean quoting = quoteLen > 0; + int trimStart = 0; + + while (pos < len) { + // quoting mode can occur several times throughout a string + // we must switch between quoting and non-quoting until we + // encounter a non-quoted delimiter, or end of string + if (quoting) { + // In quoting mode + + // If we've found a quote character, see if it's + // followed by a second quote. If so, then we need + // to actually put the quote character into the token + // rather than end the token. 
+ if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { + if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { + // matched pair of quotes, thus an escaped quote + workArea.append(srcChars, pos, quoteLen); + pos += quoteLen * 2; + trimStart = workArea.size(); + continue; + } + + // end of quoting + quoting = false; + pos += quoteLen; + continue; + } + + // copy regular character from inside quotes + workArea.append(srcChars[pos++]); + trimStart = workArea.size(); + + } else { + // Not in quoting mode + + // check for delimiter, and thus end of token + final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); + if (delimLen > 0) { + // return condition when end of token found + addToken(tokenList, workArea.substring(0, trimStart)); + return pos + delimLen; + } + + // check for quote, and thus back into quoting mode + if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { + quoting = true; + pos += quoteLen; + continue; + } + + // check for ignored (outside quotes), and ignore + final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); + if (ignoredLen > 0) { + pos += ignoredLen; + continue; + } + + // check for trimmed character + // don't yet know if its at the end, so copy to workArea + // use trimStart to keep track of trim at the end + final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); + if (trimmedLen > 0) { + workArea.append(srcChars, pos, trimmedLen); + pos += trimmedLen; + continue; + } + + // copy regular character from outside quotes + workArea.append(srcChars[pos++]); + trimStart = workArea.size(); + } + } + + // return condition when end of string found + addToken(tokenList, workArea.substring(0, trimStart)); + return -1; + } + + /** + * Checks if the characters at the index specified match the quote + * already matched in readNextToken(). 
+ * + * @param srcChars the character array being tokenized + * @param pos the position to check for a quote + * @param len the length of the character array being tokenized + * @param quoteStart the start position of the matched quote, 0 if no quoting + * @param quoteLen the length of the matched quote, 0 if no quoting + * @return true if a quote is matched + */ + private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) { + for (int i = 0; i < quoteLen; i++) { + if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { + return false; + } + } + return true; + } + + // Delimiter + //----------------------------------------------------------------------- + /** + * Gets the field delimiter matcher. + * + * @return the delimiter matcher in use + */ + public StrMatcher getDelimiterMatcher() { + return this.delimMatcher; + } + + /** + * Sets the field delimiter matcher. + * <p> + * The delimitier is used to separate one token from another. + * + * @param delim the delimiter matcher to use + * @return this, to enable chaining + */ + public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { + if (delim == null) { + this.delimMatcher = StrMatcher.noneMatcher(); + } else { + this.delimMatcher = delim; + } + return this; + } + + /** + * Sets the field delimiter character. + * + * @param delim the delimiter character to use + * @return this, to enable chaining + */ + public StrTokenizer setDelimiterChar(final char delim) { + return setDelimiterMatcher(StrMatcher.charMatcher(delim)); + } + + /** + * Sets the field delimiter string. + * + * @param delim the delimiter string to use + * @return this, to enable chaining + */ + public StrTokenizer setDelimiterString(final String delim) { + return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); + } + + // Quote + //----------------------------------------------------------------------- + /** + * Gets the quote matcher currently in use. 
+ * <p> + * The quote character is used to wrap data between the tokens. + * This enables delimiters to be entered as data. + * The default value is '"' (double quote). + * + * @return the quote matcher in use + */ + public StrMatcher getQuoteMatcher() { + return quoteMatcher; + } + + /** + * Set the quote matcher to use. + * <p> + * The quote character is used to wrap data between the tokens. + * This enables delimiters to be entered as data. + * + * @param quote the quote matcher to use, null ignored + * @return this, to enable chaining + */ + public StrTokenizer setQuoteMatcher(final StrMatcher quote) { + if (quote != null) { + this.quoteMatcher = quote; + } + return this; + } + + /** + * Sets the quote character to use. + * <p> + * The quote character is used to wrap data between the tokens. + * This enables delimiters to be entered as data. + * + * @param quote the quote character to use + * @return this, to enable chaining + */ + public StrTokenizer setQuoteChar(final char quote) { + return setQuoteMatcher(StrMatcher.charMatcher(quote)); + } + + // Ignored + //----------------------------------------------------------------------- + /** + * Gets the ignored character matcher. + * <p> + * These characters are ignored when parsing the String, unless they are + * within a quoted region. + * The default value is not to ignore anything. + * + * @return the ignored matcher in use + */ + public StrMatcher getIgnoredMatcher() { + return ignoredMatcher; + } + + /** + * Set the matcher for characters to ignore. + * <p> + * These characters are ignored when parsing the String, unless they are + * within a quoted region. + * + * @param ignored the ignored matcher to use, null ignored + * @return this, to enable chaining + */ + public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { + if (ignored != null) { + this.ignoredMatcher = ignored; + } + return this; + } + + /** + * Set the character to ignore. 
+ * <p> + * This character is ignored when parsing the String, unless it is + * within a quoted region. + * + * @param ignored the ignored character to use + * @return this, to enable chaining + */ + public StrTokenizer setIgnoredChar(final char ignored) { + return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); + } + + // Trimmer + //----------------------------------------------------------------------- + /** + * Gets the trimmer character matcher. + * <p> + * These characters are trimmed off on each side of the delimiter + * until the token or quote is found. + * The default value is not to trim anything. + * + * @return the trimmer matcher in use + */ + public StrMatcher getTrimmerMatcher() { + return trimmerMatcher; + } + + /** + * Sets the matcher for characters to trim. + * <p> + * These characters are trimmed off on each side of the delimiter + * until the token or quote is found. + * + * @param trimmer the trimmer matcher to use, null ignored + * @return this, to enable chaining + */ + public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { + if (trimmer != null) { + this.trimmerMatcher = trimmer; + } + return this; + } + + //----------------------------------------------------------------------- + /** + * Gets whether the tokenizer currently returns empty tokens as null. + * The default for this property is false. + * + * @return true if empty tokens are returned as null + */ + public boolean isEmptyTokenAsNull() { + return this.emptyAsNull; + } + + /** + * Sets whether the tokenizer should return empty tokens as null. + * The default for this property is false. + * + * @param emptyAsNull whether empty tokens are returned as null + * @return this, to enable chaining + */ + public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { + this.emptyAsNull = emptyAsNull; + return this; + } + + //----------------------------------------------------------------------- + /** + * Gets whether the tokenizer currently ignores empty tokens. 
+ * The default for this property is true. + * + * @return true if empty tokens are not returned + */ + public boolean isIgnoreEmptyTokens() { + return ignoreEmptyTokens; + } + + /** + * Sets whether the tokenizer should ignore and not return empty tokens. + * The default for this property is true. + * + * @param ignoreEmptyTokens whether empty tokens are not returned + * @return this, to enable chaining + */ + public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { + this.ignoreEmptyTokens = ignoreEmptyTokens; + return this; + } + + //----------------------------------------------------------------------- + /** + * Gets the String content that the tokenizer is parsing. + * + * @return the string content being parsed + */ + public String getContent() { + if (chars == null) { + return null; + } + return new String(chars); + } + + //----------------------------------------------------------------------- + /** + * Creates a new instance of this Tokenizer. The new instance is reset so + * that it will be at the start of the token list. + * If a {@link CloneNotSupportedException} is caught, return <code>null</code>. + * + * @return a new instance of this Tokenizer which has been reset. + */ + @Override + public Object clone() { + try { + return cloneReset(); + } catch (final CloneNotSupportedException ex) { + return null; + } + } + + /** + * Creates a new instance of this Tokenizer. The new instance is reset so that + * it will be at the start of the token list. + * + * @return a new instance of this Tokenizer which has been reset. 
+ * @throws CloneNotSupportedException if there is a problem cloning + */ + Object cloneReset() throws CloneNotSupportedException { + // this method exists to enable 100% test coverage + final StrTokenizer cloned = (StrTokenizer) super.clone(); + if (cloned.chars != null) { + cloned.chars = cloned.chars.clone(); + } + cloned.reset(); + return cloned; + } + + //----------------------------------------------------------------------- + /** + * Gets the String content that the tokenizer is parsing. + * + * @return the string content being parsed + */ + @Override + public String toString() { + if (tokens == null) { + return "StrTokenizer[not tokenized yet]"; + } + return "StrTokenizer" + getTokenList(); + } + +}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/StringEscapeUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java new file mode 100644 index 0000000..aa6b071 --- /dev/null +++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java @@ -0,0 +1,959 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.text.translate.AggregateTranslator; +import org.apache.commons.text.translate.CharSequenceTranslator; +import org.apache.commons.text.translate.CsvTranslators; +import org.apache.commons.text.translate.EntityArrays; +import org.apache.commons.text.translate.JavaUnicodeEscaper; +import org.apache.commons.text.translate.LookupTranslator; +import org.apache.commons.text.translate.NumericEntityEscaper; +import org.apache.commons.text.translate.NumericEntityUnescaper; +import org.apache.commons.text.translate.OctalUnescaper; +import org.apache.commons.text.translate.SingleLookupTranslator; +import org.apache.commons.text.translate.UnicodeUnescaper; +import org.apache.commons.text.translate.UnicodeUnpairedSurrogateRemover; + +import java.io.IOException; +import java.io.Writer; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * <p>Escapes and unescapes {@code String}s for + * Java, Java Script, HTML and XML.</p> + * + * <p>#ThreadSafe#</p> + * + * + * <p> + * This code has been adapted from Apache Commons Lang 3.5. + * </p> + * + * @since 1.0 + */ +public class StringEscapeUtils { + + /* ESCAPE TRANSLATORS */ + + /** + * Translator object for escaping Java. + * + * While {@link #escapeJava(String)} is the expected method of use, this + * object allows the Java escaping functionality to be used + * as the foundation for a custom translator. 
+ */ + public static final CharSequenceTranslator ESCAPE_JAVA; + static { + Map<CharSequence, CharSequence> escapeJavaMap = new HashMap<>(); + escapeJavaMap.put("\"", "\\\""); + escapeJavaMap.put("\\", "\\\\"); + ESCAPE_JAVA = new AggregateTranslator( + new LookupTranslator(Collections.unmodifiableMap(escapeJavaMap)), + new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE), + JavaUnicodeEscaper.outsideOf(32, 0x7f) + ); + } + + /** + * Translator object for escaping EcmaScript/JavaScript. + * + * While {@link #escapeEcmaScript(String)} is the expected method of use, this + * object allows the EcmaScript escaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator ESCAPE_ECMASCRIPT; + static { + Map<CharSequence, CharSequence> escapeEcmaScriptMap = new HashMap<>(); + escapeEcmaScriptMap.put("'", "\\'"); + escapeEcmaScriptMap.put("\"", "\\\""); + escapeEcmaScriptMap.put("\\", "\\\\"); + escapeEcmaScriptMap.put("/", "\\/"); + ESCAPE_ECMASCRIPT = new AggregateTranslator( + new LookupTranslator(Collections.unmodifiableMap(escapeEcmaScriptMap)), + new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE), + JavaUnicodeEscaper.outsideOf(32, 0x7f) + ); + } + + /** + * Translator object for escaping Json. + * + * While {@link #escapeJson(String)} is the expected method of use, this + * object allows the Json escaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator ESCAPE_JSON; + static { + Map<CharSequence, CharSequence> escapeJsonMap = new HashMap<>(); + escapeJsonMap.put("\"", "\\\""); + escapeJsonMap.put("\\", "\\\\"); + escapeJsonMap.put("/", "\\/"); + ESCAPE_JSON = new AggregateTranslator( + new LookupTranslator(Collections.unmodifiableMap(escapeJsonMap)), + new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE), + JavaUnicodeEscaper.outsideOf(32, 0x7f) + ); + } + + /** + * Translator object for escaping XML 1.0. 
+ * + * While {@link #escapeXml10(String)} is the expected method of use, this + * object allows the XML escaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator ESCAPE_XML10; + static { + Map<CharSequence, CharSequence> escapeXml10Map = new HashMap<>(); + escapeXml10Map.put("\u0000", StringUtils.EMPTY); + escapeXml10Map.put("\u0001", StringUtils.EMPTY); + escapeXml10Map.put("\u0002", StringUtils.EMPTY); + escapeXml10Map.put("\u0003", StringUtils.EMPTY); + escapeXml10Map.put("\u0004", StringUtils.EMPTY); + escapeXml10Map.put("\u0005", StringUtils.EMPTY); + escapeXml10Map.put("\u0006", StringUtils.EMPTY); + escapeXml10Map.put("\u0007", StringUtils.EMPTY); + escapeXml10Map.put("\u0008", StringUtils.EMPTY); + escapeXml10Map.put("\u000b", StringUtils.EMPTY); + escapeXml10Map.put("\u000c", StringUtils.EMPTY); + escapeXml10Map.put("\u000e", StringUtils.EMPTY); + escapeXml10Map.put("\u000f", StringUtils.EMPTY); + escapeXml10Map.put("\u0010", StringUtils.EMPTY); + escapeXml10Map.put("\u0011", StringUtils.EMPTY); + escapeXml10Map.put("\u0012", StringUtils.EMPTY); + escapeXml10Map.put("\u0013", StringUtils.EMPTY); + escapeXml10Map.put("\u0014", StringUtils.EMPTY); + escapeXml10Map.put("\u0015", StringUtils.EMPTY); + escapeXml10Map.put("\u0016", StringUtils.EMPTY); + escapeXml10Map.put("\u0017", StringUtils.EMPTY); + escapeXml10Map.put("\u0018", StringUtils.EMPTY); + escapeXml10Map.put("\u0019", StringUtils.EMPTY); + escapeXml10Map.put("\u001a", StringUtils.EMPTY); + escapeXml10Map.put("\u001b", StringUtils.EMPTY); + escapeXml10Map.put("\u001c", StringUtils.EMPTY); + escapeXml10Map.put("\u001d", StringUtils.EMPTY); + escapeXml10Map.put("\u001e", StringUtils.EMPTY); + escapeXml10Map.put("\u001f", StringUtils.EMPTY); + escapeXml10Map.put("\ufffe", StringUtils.EMPTY); + escapeXml10Map.put("\uffff", StringUtils.EMPTY); + ESCAPE_XML10 = new AggregateTranslator( + new 
LookupTranslator(EntityArrays.BASIC_ESCAPE), + new LookupTranslator(EntityArrays.APOS_ESCAPE), + new LookupTranslator(Collections.unmodifiableMap(escapeXml10Map)), + NumericEntityEscaper.between(0x7f, 0x84), + NumericEntityEscaper.between(0x86, 0x9f), + new UnicodeUnpairedSurrogateRemover() + ); + } + + /** + * Translator object for escaping XML 1.1. + * + * While {@link #escapeXml11(String)} is the expected method of use, this + * object allows the XML escaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator ESCAPE_XML11; + static { + Map<CharSequence, CharSequence> escapeXml11Map = new HashMap<>(); + escapeXml11Map.put("\u0000", StringUtils.EMPTY); + escapeXml11Map.put("\u000b", ""); + escapeXml11Map.put("\u000c", ""); + escapeXml11Map.put("\ufffe", StringUtils.EMPTY); + escapeXml11Map.put("\uffff", StringUtils.EMPTY); + ESCAPE_XML11 = new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_ESCAPE), + new LookupTranslator(EntityArrays.APOS_ESCAPE), + new LookupTranslator(Collections.unmodifiableMap(escapeXml11Map)), + NumericEntityEscaper.between(0x1, 0x8), + NumericEntityEscaper.between(0xe, 0x1f), + NumericEntityEscaper.between(0x7f, 0x84), + NumericEntityEscaper.between(0x86, 0x9f), + new UnicodeUnpairedSurrogateRemover() + ); + } + + /** + * Translator object for escaping HTML version 3.0. + * + * While {@link #escapeHtml3(String)} is the expected method of use, this + * object allows the HTML escaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator ESCAPE_HTML3 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_ESCAPE), + new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE) + ); + + /** + * The improved translator object for escaping HTML version 3.0. + * The 'improved' part of this translator is that it checks if the html is already translated. 
+ * This check prevents double, triple, or recursive translations. + * + * While {@link #escapeHtml3Once(String)} is the expected method of use, this + * object allows the HTML escaping functionality to be used + * as the foundation for a custom translator. + * + * Note that, multiple lookup tables should be passed to this translator + * instead of passing multiple instances of this translator to the + * AggregateTranslator. Because, a SingleLookupTranslator only checks the values of the + * lookup table passed to that instance while deciding whether a value is + * already translated or not. + */ + public static final CharSequenceTranslator ESCAPE_HTML3_ONCE = + new SingleLookupTranslator(EntityArrays.BASIC_ESCAPE, EntityArrays.ISO8859_1_ESCAPE); + + + /** + * Translator object for escaping HTML version 4.0. + * + * While {@link #escapeHtml4(String)} is the expected method of use, this + * object allows the HTML escaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator ESCAPE_HTML4 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_ESCAPE), + new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE), + new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE) + ); + + /** + * The improved translator object for escaping HTML version 4.0. + * The 'improved' part of this translator is that it checks if the html is already translated. + * This check prevents double, triple, or recursive translations. + * + * While {@link #escapeHtml4Once(String)} is the expected method of use, this + * object allows the HTML escaping functionality to be used + * as the foundation for a custom translator. + * + * Note that, multiple lookup tables should be passed to this translator + * instead of passing multiple instances of this translator to the + * AggregateTranslator. 
Because, a SingleLookupTranslator only checks the values of the + * lookup table passed to that instance while deciding whether a value is + * already translated or not. + */ + public static final CharSequenceTranslator ESCAPE_HTML4_ONCE = + new SingleLookupTranslator( + EntityArrays.BASIC_ESCAPE, + EntityArrays.ISO8859_1_ESCAPE, + EntityArrays.HTML40_EXTENDED_ESCAPE + ); + + /** + * Translator object for escaping individual Comma Separated Values. + * + * While {@link #escapeCsv(String)} is the expected method of use, this + * object allows the CSV escaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator ESCAPE_CSV = new CsvTranslators.CsvEscaper(); + + /** + * Translator object for escaping Shell command language. + * + * @see <a href="http://pubs.opengroup.org/onlinepubs/7908799/xcu/chap2.html">Shell Command Language</a> + */ + public static final CharSequenceTranslator ESCAPE_XSI; + static { + Map<CharSequence, CharSequence> escapeXsiMap = new HashMap<>(); + escapeXsiMap.put("|", "\\|"); + escapeXsiMap.put("&", "\\&"); + escapeXsiMap.put(";", "\\;"); + escapeXsiMap.put("<", "\\<"); + escapeXsiMap.put(">", "\\>"); + escapeXsiMap.put("(", "\\("); + escapeXsiMap.put(")", "\\)"); + escapeXsiMap.put("$", "\\$"); + escapeXsiMap.put("`", "\\`"); + escapeXsiMap.put("\\", "\\\\"); + escapeXsiMap.put("\"", "\\\""); + escapeXsiMap.put("'", "\\'"); + escapeXsiMap.put(" ", "\\ "); + escapeXsiMap.put("\t", "\\\t"); + escapeXsiMap.put("\r\n", ""); + escapeXsiMap.put("\n", ""); + escapeXsiMap.put("*", "\\*"); + escapeXsiMap.put("?", "\\?"); + escapeXsiMap.put("[", "\\["); + escapeXsiMap.put("#", "\\#"); + escapeXsiMap.put("~", "\\~"); + escapeXsiMap.put("=", "\\="); + escapeXsiMap.put("%", "\\%"); + ESCAPE_XSI = new LookupTranslator( + Collections.unmodifiableMap(escapeXsiMap) + ); + } + + /* UNESCAPE TRANSLATORS */ + + /** + * Translator object for unescaping escaped Java. 
+ * + * While {@link #unescapeJava(String)} is the expected method of use, this + * object allows the Java unescaping functionality to be used + * as the foundation for a custom translator. + */ + // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)? + public static final CharSequenceTranslator UNESCAPE_JAVA; + static { + Map<CharSequence, CharSequence> unescapeJavaMap = new HashMap<>(); + unescapeJavaMap.put("\\\\", "\\"); + unescapeJavaMap.put("\\\"", "\""); + unescapeJavaMap.put("\\'", "'"); + unescapeJavaMap.put("\\", ""); + UNESCAPE_JAVA = new AggregateTranslator( + new OctalUnescaper(), // .between('\1', '\377'), + new UnicodeUnescaper(), + new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE), + new LookupTranslator(Collections.unmodifiableMap(unescapeJavaMap)) + ); + } + + /** + * Translator object for unescaping escaped EcmaScript. + * + * While {@link #unescapeEcmaScript(String)} is the expected method of use, this + * object allows the EcmaScript unescaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA; + + /** + * Translator object for unescaping escaped Json. + * + * While {@link #unescapeJson(String)} is the expected method of use, this + * object allows the Json unescaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator UNESCAPE_JSON = UNESCAPE_JAVA; + + /** + * Translator object for unescaping escaped HTML 3.0. + * + * While {@link #unescapeHtml3(String)} is the expected method of use, this + * object allows the HTML unescaping functionality to be used + * as the foundation for a custom translator. 
+ */ + public static final CharSequenceTranslator UNESCAPE_HTML3 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_UNESCAPE), + new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE), + new NumericEntityUnescaper() + ); + + /** + * Translator object for unescaping escaped HTML 4.0. + * + * While {@link #unescapeHtml4(String)} is the expected method of use, this + * object allows the HTML unescaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator UNESCAPE_HTML4 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_UNESCAPE), + new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE), + new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE), + new NumericEntityUnescaper() + ); + + /** + * Translator object for unescaping escaped XML. + * + * While {@link #unescapeXml(String)} is the expected method of use, this + * object allows the XML unescaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator UNESCAPE_XML = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_UNESCAPE), + new LookupTranslator(EntityArrays.APOS_UNESCAPE), + new NumericEntityUnescaper() + ); + + /** + * Translator object for unescaping escaped Comma Separated Value entries. + * + * While {@link #unescapeCsv(String)} is the expected method of use, this + * object allows the CSV unescaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator UNESCAPE_CSV = new CsvTranslators.CsvUnescaper(); + + /** + * Translator object for unescaping escaped XSI Value entries. + * + * While {@link #unescapeXSI(String)} is the expected method of use, this + * object allows the XSI unescaping functionality to be used + * as the foundation for a custom translator. 
+ */ + public static final CharSequenceTranslator UNESCAPE_XSI = new XsiUnescaper(); + + /** + * Translator object for unescaping backslash escaped entries. + */ + static class XsiUnescaper extends CharSequenceTranslator { + + /** + * Escaped backslash constant. + */ + private static final char BACKSLASH = '\\'; + + @Override + public int translate(final CharSequence input, final int index, final Writer out) throws IOException { + + if (index != 0) { + throw new IllegalStateException("XsiUnescaper should never reach the [1] index"); + } + + String s = input.toString(); + + int segmentStart = 0; + int searchOffset = 0; + while (true) { + int pos = s.indexOf(BACKSLASH, searchOffset); + if (pos == -1) { + if (segmentStart < s.length()) { + out.write(s.substring(segmentStart)); + } + break; + } + if (pos > segmentStart) { + out.write(s.substring(segmentStart, pos)); + } + segmentStart = pos + 1; + searchOffset = pos + 2; + } + + return Character.codePointCount(input, 0, input.length()); + } + } + + /* Helper functions */ + + /** + * <p>{@code StringEscapeUtils} instances should NOT be constructed in + * standard programming.</p> + * + * <p>Instead, the class should be used as:</p> + * <pre>StringEscapeUtils.escapeJava("foo");</pre> + * + * <p>This constructor is public to permit tools that require a JavaBean + * instance to operate.</p> + */ + public StringEscapeUtils() { + super(); + } + + /** + * <p>Convenience wrapper for {@link java.lang.StringBuilder} providing escape methods.</p> + * + * <p>Example:</p> + * <pre> + * new Builder(ESCAPE_HTML4) + * .append("<p>") + * .escape("This is paragraph 1 and special chars like & get escaped.") + * .append("</p><p>") + * .escape("This is paragraph 2 & more...") + * .append("</p>") + * .toString() + * </pre> + * + */ + public static final class Builder { + + /** + * StringBuilder to be used in the Builder class. + */ + private final StringBuilder sb; + + /** + * CharSequenceTranslator to be used in the Builder class. 
+ */ + private final CharSequenceTranslator translator; + + /** + * Builder constructor. + * + * @param translator a CharSequenceTranslator. + */ + private Builder(final CharSequenceTranslator translator) { + this.sb = new StringBuilder(); + this.translator = translator; + } + + /** + * <p>Escape {@code input} according to the given {@link CharSequenceTranslator}.</p> + * + * @param input the String to escape + * @return {@code this}, to enable chaining + */ + public Builder escape(final String input) { + sb.append(translator.translate(input)); + return this; + } + + /** + * Literal append, no escaping being done. + * + * @param input the String to append + * @return {@code this}, to enable chaining + */ + public Builder append(final String input) { + sb.append(input); + return this; + } + + /** + * <p>Return the escaped string.</p> + * + * @return the escaped string + */ + @Override + public String toString() { + return sb.toString(); + } + } + + /** + * Get a {@link Builder}. + * @param translator the text translator + * @return {@link Builder} + */ + public static StringEscapeUtils.Builder builder(final CharSequenceTranslator translator) { + return new Builder(translator); + } + + // Java and JavaScript + //-------------------------------------------------------------------------- + /** + * <p>Escapes the characters in a {@code String} using Java String rules.</p> + * + * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p> + * + * <p>So a tab becomes the characters {@code '\\'} and + * {@code 't'}.</p> + * + * <p>The only difference between Java strings and JavaScript strings + * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p> + * + * <p>Example:</p> + * <pre> + * input string: He didn't say, "Stop!" 
+ * output string: He didn't say, \"Stop!\" + * </pre> + * + * @param input String to escape values in, may be null + * @return String with escaped values, {@code null} if null string input + */ + public static final String escapeJava(final String input) { + return ESCAPE_JAVA.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using EcmaScript String rules.</p> + * <p>Escapes any values it finds into their EcmaScript String form. + * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p> + * + * <p>So a tab becomes the characters {@code '\\'} and + * {@code 't'}.</p> + * + * <p>The only difference between Java strings and EcmaScript strings + * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p> + * + * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects. </p> + * + * <p>Example:</p> + * <pre> + * input string: He didn't say, "Stop!" + * output string: He didn\'t say, \"Stop!\" + * </pre> + * + * <b>Security Note.</b> We only provide backslash escaping in this method. For example, {@code '\"'} has the output + * {@code '\\\"'} which could result in potential issues in the case where the string being escaped is being used + * in an HTML tag like {@code <select onmouseover="..." />}. If you wish to have more rigorous string escaping, you + * may consider the + * <a href="https://www.owasp.org/index.php/Category:OWASP_Enterprise_Security_API_JAVA">ESAPI Libraries</a>. + * Further, you can view the <a href="https://github.com/esapi">ESAPI GitHub Org</a>. + * + * @param input String to escape values in, may be null + * @return String with escaped values, {@code null} if null string input + */ + public static final String escapeEcmaScript(final String input) { + return ESCAPE_ECMASCRIPT.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using Json String rules.</p> + * <p>Escapes any values it finds into their Json String form. 
+ * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p> + * + * <p>So a tab becomes the characters {@code '\\'} and + * {@code 't'}.</p> + * + * <p>The only difference between Java strings and Json strings + * is that in Json, forward-slash (/) is escaped.</p> + * + * <p>See http://www.ietf.org/rfc/rfc4627.txt for further details. </p> + * + * <p>Example:</p> + * <pre> + * input string: He didn't say, "Stop!" + * output string: He didn't say, \"Stop!\" + * </pre> + * + * @param input String to escape values in, may be null + * @return String with escaped values, {@code null} if null string input + */ + public static final String escapeJson(final String input) { + return ESCAPE_JSON.translate(input); + } + + /** + * <p>Unescapes any Java literals found in the {@code String}. + * For example, it will turn a sequence of {@code '\'} and + * {@code 'n'} into a newline character, unless the {@code '\'} + * is preceded by another {@code '\'}.</p> + * + * @param input the {@code String} to unescape, may be null + * @return a new unescaped {@code String}, {@code null} if null string input + */ + public static final String unescapeJava(final String input) { + return UNESCAPE_JAVA.translate(input); + } + + /** + * <p>Unescapes any EcmaScript literals found in the {@code String}.</p> + * + * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'} + * into a newline character, unless the {@code '\'} is preceded by another + * {@code '\'}.</p> + * + * @see #unescapeJava(String) + * @param input the {@code String} to unescape, may be null + * @return A new unescaped {@code String}, {@code null} if null string input + */ + public static final String unescapeEcmaScript(final String input) { + return UNESCAPE_ECMASCRIPT.translate(input); + } + + /** + * <p>Unescapes any Json literals found in the {@code String}.</p> + * + * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'} + * into a newline character, unless the 
{@code '\'} is preceded by another + * {@code '\'}.</p> + * + * @see #unescapeJava(String) + * @param input the {@code String} to unescape, may be null + * @return A new unescaped {@code String}, {@code null} if null string input + */ + public static final String unescapeJson(final String input) { + return UNESCAPE_JSON.translate(input); + } + + // HTML and XML + //-------------------------------------------------------------------------- + /** + * <p>Escapes the characters in a {@code String} using HTML entities.</p> + * + * <p> + * For example: + * </p> + * <p><code>"bread" & "butter"</code></p> + * becomes: + * <p> + * <code>&quot;bread&quot; &amp; &quot;butter&quot;</code>. + * </p> + * + * <p>Supports all known HTML 4.0 entities, including funky accents. + * Note that the commonly used apostrophe escape character (&apos;) + * is not a legal entity and so is not supported). </p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + * + * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> + * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> + * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> + */ + public static final String escapeHtml4(final String input) { + return ESCAPE_HTML4.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using HTML entities. + * But escapes them only once. i.e. 
does not escape already escaped characters.</p> + * + * <p> + * For example: + * </p> + * <p><code>"bread" & "butter"</code></p> + * becomes: + * <p> + * <code>&quot;bread&quot; &amp; &quot;butter&quot;</code>. + * </p> + * + * <p> + * But: + * </p> + * <p><code>&quot;bread&quot; &amp; &quot;butter&quot;</code></p> + * remains unaffected. + * + * <p>Supports all known HTML 4.0 entities, including funky accents. + * Note that the commonly used apostrophe escape character (&apos;) + * is not a legal entity and so is not supported). </p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + * + * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> + * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> + * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> + */ + public static final String escapeHtml4Once(final String input) { + return ESCAPE_HTML4_ONCE.translate(input); + } + + + /** + * <p>Escapes the characters in a {@code String} using HTML entities.</p> + * <p>Supports only the HTML 3.0 entities. </p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + */ + public static final String escapeHtml3(final String input) { + return ESCAPE_HTML3.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using HTML entities. + * But escapes them only once. i.e. does not escape already escaped characters.</p> + * <p>Supports only the HTML 3.0 entities. 
</p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + */ + public static final String escapeHtml3Once(final String input) { + return ESCAPE_HTML3_ONCE.translate(input); + } + + //----------------------------------------------------------------------- + /** + * <p>Unescapes a string containing entity escapes to a string + * containing the actual Unicode characters corresponding to the + * escapes. Supports HTML 4.0 entities.</p> + * + * <p>For example, the string {@code "<Français>"} + * will become {@code "<Fran�ais>"}</p> + * + * <p>If an entity is unrecognized, it is left alone, and inserted + * verbatim into the result string. e.g. {@code ">&zzzz;x"} will + * become {@code ">&zzzz;x"}.</p> + * + * @param input the {@code String} to unescape, may be null + * @return a new unescaped {@code String}, {@code null} if null string input + */ + public static final String unescapeHtml4(final String input) { + return UNESCAPE_HTML4.translate(input); + } + + /** + * <p>Unescapes a string containing entity escapes to a string + * containing the actual Unicode characters corresponding to the + * escapes. Supports only HTML 3.0 entities.</p> + * + * @param input the {@code String} to unescape, may be null + * @return a new unescaped {@code String}, {@code null} if null string input + */ + public static final String unescapeHtml3(final String input) { + return UNESCAPE_HTML3.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using XML entities.</p> + * + * <p>For example: {@code "bread" & "butter"} => + * {@code "bread" & "butter"}. + * </p> + * + * <p>Note that XML 1.0 is a text-only format: it cannot represent control + * characters or unpaired Unicode surrogate codepoints, even after escaping. 
+ * {@code escapeXml10} will remove characters that do not fit in the + * following ranges:</p> + * + * <p>{@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p> + * + * <p>Though not strictly necessary, {@code escapeXml10} will escape + * characters in the following ranges:</p> + * + * <p>{@code [#x7F-#x84] | [#x86-#x9F]}</p> + * + * <p>The returned string can be inserted into a valid XML 1.0 or XML 1.1 + * document. If you want to allow more non-text characters in an XML 1.1 + * document, use {@link #escapeXml11(String)}.</p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + * @see #unescapeXml(java.lang.String) + */ + public static String escapeXml10(final String input) { + return ESCAPE_XML10.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using XML entities.</p> + * + * <p>For example: {@code "bread" & "butter"} =&gt; + * {@code &quot;bread&quot; &amp; &quot;butter&quot;}. + * </p> + * + * <p>XML 1.1 can represent certain control characters, but it cannot represent + * the null byte or unpaired Unicode surrogate codepoints, even after escaping. + * {@code escapeXml11} will remove characters that do not fit in the following + * ranges:</p> + * + * <p>{@code [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p> + * + * <p>{@code escapeXml11} will escape characters in the following ranges:</p> + * + * <p>{@code [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}</p> + * + * <p>The returned string can be inserted into a valid XML 1.1 document. 
Do not + * use it for XML 1.0 documents.</p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + * @see #unescapeXml(java.lang.String) + */ + public static String escapeXml11(final String input) { + return ESCAPE_XML11.translate(input); + } + + //----------------------------------------------------------------------- + /** + * <p>Unescapes a string containing XML entity escapes to a string + * containing the actual Unicode characters corresponding to the + * escapes.</p> + * + * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos). + * Does not support DTDs or external entities.</p> + * + * <p>Note that numerical \\u Unicode codes are unescaped to their respective + * Unicode characters. This may change in future releases.</p> + * + * @param input the {@code String} to unescape, may be null + * @return a new unescaped {@code String}, {@code null} if null string input + * @see #escapeXml10(String) + * @see #escapeXml11(String) + */ + public static final String unescapeXml(final String input) { + return UNESCAPE_XML.translate(input); + } + + //----------------------------------------------------------------------- + + /** + * <p>Returns a {@code String} value for a CSV column enclosed in double quotes, + * if required.</p> + * + * <p>If the value contains a comma, newline or double quote, then the + * String value is returned enclosed in double quotes.</p> + * + * <p>Any double quote characters in the value are escaped with another double quote.</p> + * + * <p>If the value does not contain a comma, newline or double quote, then the + * String value is returned unchanged.</p> + * + * <p>See <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and + * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.</p> 
+ * + * @param input the input CSV column String, may be null + * @return the input String, enclosed in double quotes if the value contains a comma, + * newline or double quote, {@code null} if null string input + */ + public static final String escapeCsv(final String input) { + return ESCAPE_CSV.translate(input); + } + + /** + * <p>Returns a {@code String} value for an unescaped CSV column.</p> + * + * <p>If the value is enclosed in double quotes, and contains a comma, newline + * or double quote, then the enclosing quotes are removed. + * </p> + * + * <p>Any double quote escaped characters (a pair of double quotes) are unescaped + * to just one double quote.</p> + * + * <p>If the value is not enclosed in double quotes, or is enclosed in double quotes but + * does not contain a comma, newline or double quote, then the String value is returned unchanged.</p> + * + * <p>See <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and + * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.</p> + * + * @param input the input CSV column String, may be null + * @return the input String, with enclosing double quotes removed and embedded double + * quotes unescaped, {@code null} if null string input + */ + public static final String unescapeCsv(final String input) { + return UNESCAPE_CSV.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using XSI rules.</p> + * + * <p><b>Beware!</b> In most cases you don't want to escape shell commands but use multi-argument + * methods provided by {@link java.lang.ProcessBuilder} or {@link java.lang.Runtime#exec(String[])} + * instead.</p> + * + * <p>Example:</p> + * <pre> + * input string: He didn't say, "Stop!" 
+ * output string: He\ didn\'t\ say,\ \"Stop!\" + * </pre> + * + * @param input String to escape values in, may be null + * @return String with escaped values, {@code null} if null string input + * @see <a href="http://pubs.opengroup.org/onlinepubs/7908799/xcu/chap2.html">Shell Command Language</a> + */ + public static final String escapeXSI(final String input) { + return ESCAPE_XSI.translate(input); + } + + /** + * <p>Unescapes the characters in a {@code String} using XSI rules.</p> + * + * @param input the {@code String} to unescape, may be null + * @return a new unescaped {@code String}, {@code null} if null string input + * @see StringEscapeUtils#escapeXSI(String) + */ + public static final String unescapeXSI(final String input) { + return UNESCAPE_XSI.translate(input); + } + +}