http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/StrTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/StrTokenizer.java b/src/main/java/org/apache/commons/text/beta/StrTokenizer.java deleted file mode 100644 index 87e09ba..0000000 --- a/src/main/java/org/apache/commons/text/beta/StrTokenizer.java +++ /dev/null @@ -1,1118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.ListIterator; -import java.util.NoSuchElementException; - -/** - * Tokenizes a string based on delimiters (separators) - * and supporting quoting and ignored character concepts. - * <p> - * This class can split a String into many smaller strings. It aims - * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, - * however it offers much more control and flexibility including implementing - * the <code>ListIterator</code> interface. By default, it is set up - * like <code>StringTokenizer</code>. 
- * <p> - * The input String is split into a number of <i>tokens</i>. - * Each token is separated from the next String by a <i>delimiter</i>. - * One or more delimiter characters must be specified. - * <p> - * Each token may be surrounded by quotes. - * The <i>quote</i> matcher specifies the quote character(s). - * A quote may be escaped within a quoted section by duplicating itself. - * <p> - * Between each token and the delimiter are potentially characters that need trimming. - * The <i>trimmer</i> matcher specifies these characters. - * One usage might be to trim whitespace characters. - * <p> - * At any point outside the quotes there might potentially be invalid characters. - * The <i>ignored</i> matcher specifies these characters to be removed. - * One usage might be to remove new line characters. - * <p> - * Empty tokens may be removed or returned as null. - * <pre> - * "a,b,c" - Three tokens "a","b","c" (comma delimiter) - * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) - * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) - * </pre> - * <p> - * - * This tokenizer has the following properties and options: - * - * <table summary="Tokenizer Properties"> - * <tr> - * <th>Property</th><th>Type</th><th>Default</th> - * </tr> - * <tr> - * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> - * </tr> - * <tr> - * <td>quote</td><td>NoneMatcher</td><td>{}</td> - * </tr> - * <tr> - * <td>ignore</td><td>NoneMatcher</td><td>{}</td> - * </tr> - * <tr> - * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> - * </tr> - * <tr> - * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> - * </tr> - * </table> - * - * @since 1.0 - */ -public class StrTokenizer implements ListIterator<String>, Cloneable { - - private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE; - private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE; - static { - CSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); - 
CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher()); - CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); - CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); - CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); - CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); - CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); - - TSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); - TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher()); - TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); - TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); - TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); - TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); - TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); - } - - /** The text to work on. */ - private char chars[]; - /** The parsed tokens */ - private String tokens[]; - /** The current iteration position */ - private int tokenPos; - - /** The delimiter matcher */ - private StrMatcher delimMatcher = StrMatcher.splitMatcher(); - /** The quote matcher */ - private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); - /** The ignored matcher */ - private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); - /** The trimmer matcher */ - private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); - - /** Whether to return empty tokens as null */ - private boolean emptyAsNull = false; - /** Whether to ignore empty tokens */ - private boolean ignoreEmptyTokens = true; - - //----------------------------------------------------------------------- - - /** - * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>. - * - * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>. 
- */ - private static StrTokenizer getCSVClone() { - return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); - } - - /** - * Gets a new tokenizer instance which parses Comma Separated Value strings - * initializing it with the given input. The default for CSV processing - * will be trim whitespace from both ends (which can be overridden with - * the setTrimmer method). - * <p> - * You must call a "reset" method to set the string which you want to parse. - * @return a new tokenizer instance which parses Comma Separated Value strings - */ - public static StrTokenizer getCSVInstance() { - return getCSVClone(); - } - - /** - * Gets a new tokenizer instance which parses Comma Separated Value strings - * initializing it with the given input. The default for CSV processing - * will be trim whitespace from both ends (which can be overridden with - * the setTrimmer method). - * - * @param input the text to parse - * @return a new tokenizer instance which parses Comma Separated Value strings - */ - public static StrTokenizer getCSVInstance(final String input) { - final StrTokenizer tok = getCSVClone(); - tok.reset(input); - return tok; - } - - /** - * Gets a new tokenizer instance which parses Comma Separated Value strings - * initializing it with the given input. The default for CSV processing - * will be trim whitespace from both ends (which can be overridden with - * the setTrimmer method). - * - * @param input the text to parse - * @return a new tokenizer instance which parses Comma Separated Value strings - */ - public static StrTokenizer getCSVInstance(final char[] input) { - final StrTokenizer tok = getCSVClone(); - tok.reset(input); - return tok; - } - - /** - * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>. - * - * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>. 
- */ - private static StrTokenizer getTSVClone() { - return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); - } - - - /** - * Gets a new tokenizer instance which parses Tab Separated Value strings. - * The default for CSV processing will be trim whitespace from both ends - * (which can be overridden with the setTrimmer method). - * <p> - * You must call a "reset" method to set the string which you want to parse. - * @return a new tokenizer instance which parses Tab Separated Value strings. - */ - public static StrTokenizer getTSVInstance() { - return getTSVClone(); - } - - /** - * Gets a new tokenizer instance which parses Tab Separated Value strings. - * The default for CSV processing will be trim whitespace from both ends - * (which can be overridden with the setTrimmer method). - * @param input the string to parse - * @return a new tokenizer instance which parses Tab Separated Value strings. - */ - public static StrTokenizer getTSVInstance(final String input) { - final StrTokenizer tok = getTSVClone(); - tok.reset(input); - return tok; - } - - /** - * Gets a new tokenizer instance which parses Tab Separated Value strings. - * The default for CSV processing will be trim whitespace from both ends - * (which can be overridden with the setTrimmer method). - * @param input the string to parse - * @return a new tokenizer instance which parses Tab Separated Value strings. - */ - public static StrTokenizer getTSVInstance(final char[] input) { - final StrTokenizer tok = getTSVClone(); - tok.reset(input); - return tok; - } - - //----------------------------------------------------------------------- - /** - * Constructs a tokenizer splitting on space, tab, newline and formfeed - * as per StringTokenizer, but with no text to tokenize. - * <p> - * This constructor is normally used with {@link #reset(String)}. 
- */ - public StrTokenizer() { - super(); - this.chars = null; - } - - /** - * Constructs a tokenizer splitting on space, tab, newline and formfeed - * as per StringTokenizer. - * - * @param input the string which is to be parsed - */ - public StrTokenizer(final String input) { - super(); - if (input != null) { - chars = input.toCharArray(); - } else { - chars = null; - } - } - - /** - * Constructs a tokenizer splitting on the specified delimiter character. - * - * @param input the string which is to be parsed - * @param delim the field delimiter character - */ - public StrTokenizer(final String input, final char delim) { - this(input); - setDelimiterChar(delim); - } - - /** - * Constructs a tokenizer splitting on the specified delimiter string. - * - * @param input the string which is to be parsed - * @param delim the field delimiter string - */ - public StrTokenizer(final String input, final String delim) { - this(input); - setDelimiterString(delim); - } - - /** - * Constructs a tokenizer splitting using the specified delimiter matcher. - * - * @param input the string which is to be parsed - * @param delim the field delimiter matcher - */ - public StrTokenizer(final String input, final StrMatcher delim) { - this(input); - setDelimiterMatcher(delim); - } - - /** - * Constructs a tokenizer splitting on the specified delimiter character - * and handling quotes using the specified quote character. - * - * @param input the string which is to be parsed - * @param delim the field delimiter character - * @param quote the field quoted string character - */ - public StrTokenizer(final String input, final char delim, final char quote) { - this(input, delim); - setQuoteChar(quote); - } - - /** - * Constructs a tokenizer splitting using the specified delimiter matcher - * and handling quotes using the specified quote matcher. 
- * - * @param input the string which is to be parsed - * @param delim the field delimiter matcher - * @param quote the field quoted string matcher - */ - public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { - this(input, delim); - setQuoteMatcher(quote); - } - - /** - * Constructs a tokenizer splitting on space, tab, newline and formfeed - * as per StringTokenizer. - * - * @param input the string which is to be parsed, not cloned - */ - public StrTokenizer(final char[] input) { - super(); - if (input == null) { - this.chars = null; - } else { - this.chars = input.clone(); - } - } - - /** - * Constructs a tokenizer splitting on the specified character. - * - * @param input the string which is to be parsed, not cloned - * @param delim the field delimiter character - */ - public StrTokenizer(final char[] input, final char delim) { - this(input); - setDelimiterChar(delim); - } - - /** - * Constructs a tokenizer splitting on the specified string. - * - * @param input the string which is to be parsed, not cloned - * @param delim the field delimiter string - */ - public StrTokenizer(final char[] input, final String delim) { - this(input); - setDelimiterString(delim); - } - - /** - * Constructs a tokenizer splitting using the specified delimiter matcher. - * - * @param input the string which is to be parsed, not cloned - * @param delim the field delimiter matcher - */ - public StrTokenizer(final char[] input, final StrMatcher delim) { - this(input); - setDelimiterMatcher(delim); - } - - /** - * Constructs a tokenizer splitting on the specified delimiter character - * and handling quotes using the specified quote character. 
- * - * @param input the string which is to be parsed, not cloned - * @param delim the field delimiter character - * @param quote the field quoted string character - */ - public StrTokenizer(final char[] input, final char delim, final char quote) { - this(input, delim); - setQuoteChar(quote); - } - - /** - * Constructs a tokenizer splitting using the specified delimiter matcher - * and handling quotes using the specified quote matcher. - * - * @param input the string which is to be parsed, not cloned - * @param delim the field delimiter character - * @param quote the field quoted string character - */ - public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { - this(input, delim); - setQuoteMatcher(quote); - } - - // API - //----------------------------------------------------------------------- - /** - * Gets the number of tokens found in the String. - * - * @return the number of matched tokens - */ - public int size() { - checkTokenized(); - return tokens.length; - } - - /** - * Gets the next token from the String. - * Equivalent to {@link #next()} except it returns null rather than - * throwing {@link NoSuchElementException} when no tokens remain. - * - * @return the next sequential token, or null when no more tokens are found - */ - public String nextToken() { - if (hasNext()) { - return tokens[tokenPos++]; - } - return null; - } - - /** - * Gets the previous token from the String. - * - * @return the previous sequential token, or null when no more tokens are found - */ - public String previousToken() { - if (hasPrevious()) { - return tokens[--tokenPos]; - } - return null; - } - - /** - * Gets a copy of the full token list as an independent modifiable array. - * - * @return the tokens as a String array - */ - public String[] getTokenArray() { - checkTokenized(); - return tokens.clone(); - } - - /** - * Gets a copy of the full token list as an independent modifiable list. 
- * - * @return the tokens as a String array - */ - public List<String> getTokenList() { - checkTokenized(); - final List<String> list = new ArrayList<>(tokens.length); - for (final String element : tokens) { - list.add(element); - } - return list; - } - - /** - * Resets this tokenizer, forgetting all parsing and iteration already completed. - * <p> - * This method allows the same tokenizer to be reused for the same String. - * - * @return this, to enable chaining - */ - public StrTokenizer reset() { - tokenPos = 0; - tokens = null; - return this; - } - - /** - * Reset this tokenizer, giving it a new input string to parse. - * In this manner you can re-use a tokenizer with the same settings - * on multiple input lines. - * - * @param input the new string to tokenize, null sets no text to parse - * @return this, to enable chaining - */ - public StrTokenizer reset(final String input) { - reset(); - if (input != null) { - this.chars = input.toCharArray(); - } else { - this.chars = null; - } - return this; - } - - /** - * Reset this tokenizer, giving it a new input string to parse. - * In this manner you can re-use a tokenizer with the same settings - * on multiple input lines. - * - * @param input the new character array to tokenize, not cloned, null sets no text to parse - * @return this, to enable chaining - */ - public StrTokenizer reset(final char[] input) { - reset(); - if (input != null) { - this.chars = input.clone(); - } else { - this.chars = null; - } - return this; - } - - // ListIterator - //----------------------------------------------------------------------- - /** - * Checks whether there are any more tokens. - * - * @return true if there are more tokens - */ - @Override - public boolean hasNext() { - checkTokenized(); - return tokenPos < tokens.length; - } - - /** - * Gets the next token. 
- * - * @return the next String token - * @throws NoSuchElementException if there are no more elements - */ - @Override - public String next() { - if (hasNext()) { - return tokens[tokenPos++]; - } - throw new NoSuchElementException(); - } - - /** - * Gets the index of the next token to return. - * - * @return the next token index - */ - @Override - public int nextIndex() { - return tokenPos; - } - - /** - * Checks whether there are any previous tokens that can be iterated to. - * - * @return true if there are previous tokens - */ - @Override - public boolean hasPrevious() { - checkTokenized(); - return tokenPos > 0; - } - - /** - * Gets the token previous to the last returned token. - * - * @return the previous token - */ - @Override - public String previous() { - if (hasPrevious()) { - return tokens[--tokenPos]; - } - throw new NoSuchElementException(); - } - - /** - * Gets the index of the previous token. - * - * @return the previous token index - */ - @Override - public int previousIndex() { - return tokenPos - 1; - } - - /** - * Unsupported ListIterator operation. - * - * @throws UnsupportedOperationException always - */ - @Override - public void remove() { - throw new UnsupportedOperationException("remove() is unsupported"); - } - - /** - * Unsupported ListIterator operation. - * @param obj this parameter ignored. - * @throws UnsupportedOperationException always - */ - @Override - public void set(final String obj) { - throw new UnsupportedOperationException("set() is unsupported"); - } - - /** - * Unsupported ListIterator operation. - * @param obj this parameter ignored. - * @throws UnsupportedOperationException always - */ - @Override - public void add(final String obj) { - throw new UnsupportedOperationException("add() is unsupported"); - } - - // Implementation - //----------------------------------------------------------------------- - /** - * Checks if tokenization has been done, and if not then do it. 
- */ - private void checkTokenized() { - if (tokens == null) { - if (chars == null) { - // still call tokenize as subclass may do some work - final List<String> split = tokenize(null, 0, 0); - tokens = split.toArray(new String[split.size()]); - } else { - final List<String> split = tokenize(chars, 0, chars.length); - tokens = split.toArray(new String[split.size()]); - } - } - } - - /** - * Internal method to performs the tokenization. - * <p> - * Most users of this class do not need to call this method. This method - * will be called automatically by other (public) methods when required. - * <p> - * This method exists to allow subclasses to add code before or after the - * tokenization. For example, a subclass could alter the character array, - * offset or count to be parsed, or call the tokenizer multiple times on - * multiple strings. It is also be possible to filter the results. - * <p> - * <code>StrTokenizer</code> will always pass a zero offset and a count - * equal to the length of the array to this method, however a subclass - * may pass other values, or even an entirely different array. 
- * - * @param srcChars the character array being tokenized, may be null - * @param offset the start position within the character array, must be valid - * @param count the number of characters to tokenize, must be valid - * @return the modifiable list of String tokens, unmodifiable if null array or zero count - */ - protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { - if (srcChars == null || count == 0) { - return Collections.emptyList(); - } - final StrBuilder buf = new StrBuilder(); - final List<String> tokenList = new ArrayList<>(); - int pos = offset; - - // loop around the entire buffer - while (pos >= 0 && pos < count) { - // find next token - pos = readNextToken(srcChars, pos, count, buf, tokenList); - - // handle case where end of string is a delimiter - if (pos >= count) { - addToken(tokenList, ""); - } - } - return tokenList; - } - - /** - * Adds a token to a list, paying attention to the parameters we've set. - * - * @param list the list to add to - * @param tok the token to add - */ - private void addToken(final List<String> list, String tok) { - if (tok == null || tok.length() == 0) { - if (isIgnoreEmptyTokens()) { - return; - } - if (isEmptyTokenAsNull()) { - tok = null; - } - } - list.add(tok); - } - - /** - * Reads character by character through the String to get the next token. 
- * - * @param srcChars the character array being tokenized - * @param start the first character of field - * @param len the length of the character array being tokenized - * @param workArea a temporary work area - * @param tokenList the list of parsed tokens - * @return the starting position of the next field (the character - * immediately after the delimiter), or -1 if end of string found - */ - private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) { - // skip all leading whitespace, unless it is the - // field delimiter or the quote character - while (start < len) { - final int removeLen = Math.max( - getIgnoredMatcher().isMatch(srcChars, start, start, len), - getTrimmerMatcher().isMatch(srcChars, start, start, len)); - if (removeLen == 0 || - getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 || - getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { - break; - } - start += removeLen; - } - - // handle reaching end - if (start >= len) { - addToken(tokenList, ""); - return -1; - } - - // handle empty token - final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); - if (delimLen > 0) { - addToken(tokenList, ""); - return start + delimLen; - } - - // handle found token - final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); - if (quoteLen > 0) { - return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); - } - return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); - } - - /** - * Reads a possibly quoted string token. 
- * - * @param srcChars the character array being tokenized - * @param start the first character of field - * @param len the length of the character array being tokenized - * @param workArea a temporary work area - * @param tokenList the list of parsed tokens - * @param quoteStart the start position of the matched quote, 0 if no quoting - * @param quoteLen the length of the matched quote, 0 if no quoting - * @return the starting position of the next field (the character - * immediately after the delimiter, or if end of string found, - * then the length of string - */ - private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, - final List<String> tokenList, final int quoteStart, final int quoteLen) { - // Loop until we've found the end of the quoted - // string or the end of the input - workArea.clear(); - int pos = start; - boolean quoting = quoteLen > 0; - int trimStart = 0; - - while (pos < len) { - // quoting mode can occur several times throughout a string - // we must switch between quoting and non-quoting until we - // encounter a non-quoted delimiter, or end of string - if (quoting) { - // In quoting mode - - // If we've found a quote character, see if it's - // followed by a second quote. If so, then we need - // to actually put the quote character into the token - // rather than end the token. 
- if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { - if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { - // matched pair of quotes, thus an escaped quote - workArea.append(srcChars, pos, quoteLen); - pos += quoteLen * 2; - trimStart = workArea.size(); - continue; - } - - // end of quoting - quoting = false; - pos += quoteLen; - continue; - } - - // copy regular character from inside quotes - workArea.append(srcChars[pos++]); - trimStart = workArea.size(); - - } else { - // Not in quoting mode - - // check for delimiter, and thus end of token - final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); - if (delimLen > 0) { - // return condition when end of token found - addToken(tokenList, workArea.substring(0, trimStart)); - return pos + delimLen; - } - - // check for quote, and thus back into quoting mode - if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { - quoting = true; - pos += quoteLen; - continue; - } - - // check for ignored (outside quotes), and ignore - final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); - if (ignoredLen > 0) { - pos += ignoredLen; - continue; - } - - // check for trimmed character - // don't yet know if its at the end, so copy to workArea - // use trimStart to keep track of trim at the end - final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); - if (trimmedLen > 0) { - workArea.append(srcChars, pos, trimmedLen); - pos += trimmedLen; - continue; - } - - // copy regular character from outside quotes - workArea.append(srcChars[pos++]); - trimStart = workArea.size(); - } - } - - // return condition when end of string found - addToken(tokenList, workArea.substring(0, trimStart)); - return -1; - } - - /** - * Checks if the characters at the index specified match the quote - * already matched in readNextToken(). 
- * - * @param srcChars the character array being tokenized - * @param pos the position to check for a quote - * @param len the length of the character array being tokenized - * @param quoteStart the start position of the matched quote, 0 if no quoting - * @param quoteLen the length of the matched quote, 0 if no quoting - * @return true if a quote is matched - */ - private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) { - for (int i = 0; i < quoteLen; i++) { - if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { - return false; - } - } - return true; - } - - // Delimiter - //----------------------------------------------------------------------- - /** - * Gets the field delimiter matcher. - * - * @return the delimiter matcher in use - */ - public StrMatcher getDelimiterMatcher() { - return this.delimMatcher; - } - - /** - * Sets the field delimiter matcher. - * <p> - * The delimitier is used to separate one token from another. - * - * @param delim the delimiter matcher to use - * @return this, to enable chaining - */ - public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { - if (delim == null) { - this.delimMatcher = StrMatcher.noneMatcher(); - } else { - this.delimMatcher = delim; - } - return this; - } - - /** - * Sets the field delimiter character. - * - * @param delim the delimiter character to use - * @return this, to enable chaining - */ - public StrTokenizer setDelimiterChar(final char delim) { - return setDelimiterMatcher(StrMatcher.charMatcher(delim)); - } - - /** - * Sets the field delimiter string. - * - * @param delim the delimiter string to use - * @return this, to enable chaining - */ - public StrTokenizer setDelimiterString(final String delim) { - return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); - } - - // Quote - //----------------------------------------------------------------------- - /** - * Gets the quote matcher currently in use. 
- * <p> - * The quote character is used to wrap data between the tokens. - * This enables delimiters to be entered as data. - * The default value is '"' (double quote). - * - * @return the quote matcher in use - */ - public StrMatcher getQuoteMatcher() { - return quoteMatcher; - } - - /** - * Set the quote matcher to use. - * <p> - * The quote character is used to wrap data between the tokens. - * This enables delimiters to be entered as data. - * - * @param quote the quote matcher to use, null ignored - * @return this, to enable chaining - */ - public StrTokenizer setQuoteMatcher(final StrMatcher quote) { - if (quote != null) { - this.quoteMatcher = quote; - } - return this; - } - - /** - * Sets the quote character to use. - * <p> - * The quote character is used to wrap data between the tokens. - * This enables delimiters to be entered as data. - * - * @param quote the quote character to use - * @return this, to enable chaining - */ - public StrTokenizer setQuoteChar(final char quote) { - return setQuoteMatcher(StrMatcher.charMatcher(quote)); - } - - // Ignored - //----------------------------------------------------------------------- - /** - * Gets the ignored character matcher. - * <p> - * These characters are ignored when parsing the String, unless they are - * within a quoted region. - * The default value is not to ignore anything. - * - * @return the ignored matcher in use - */ - public StrMatcher getIgnoredMatcher() { - return ignoredMatcher; - } - - /** - * Set the matcher for characters to ignore. - * <p> - * These characters are ignored when parsing the String, unless they are - * within a quoted region. - * - * @param ignored the ignored matcher to use, null ignored - * @return this, to enable chaining - */ - public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { - if (ignored != null) { - this.ignoredMatcher = ignored; - } - return this; - } - - /** - * Set the character to ignore. 
- * <p> - * This character is ignored when parsing the String, unless it is - * within a quoted region. - * - * @param ignored the ignored character to use - * @return this, to enable chaining - */ - public StrTokenizer setIgnoredChar(final char ignored) { - return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); - } - - // Trimmer - //----------------------------------------------------------------------- - /** - * Gets the trimmer character matcher. - * <p> - * These characters are trimmed off on each side of the delimiter - * until the token or quote is found. - * The default value is not to trim anything. - * - * @return the trimmer matcher in use - */ - public StrMatcher getTrimmerMatcher() { - return trimmerMatcher; - } - - /** - * Sets the matcher for characters to trim. - * <p> - * These characters are trimmed off on each side of the delimiter - * until the token or quote is found. - * - * @param trimmer the trimmer matcher to use, null ignored - * @return this, to enable chaining - */ - public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { - if (trimmer != null) { - this.trimmerMatcher = trimmer; - } - return this; - } - - //----------------------------------------------------------------------- - /** - * Gets whether the tokenizer currently returns empty tokens as null. - * The default for this property is false. - * - * @return true if empty tokens are returned as null - */ - public boolean isEmptyTokenAsNull() { - return this.emptyAsNull; - } - - /** - * Sets whether the tokenizer should return empty tokens as null. - * The default for this property is false. - * - * @param emptyAsNull whether empty tokens are returned as null - * @return this, to enable chaining - */ - public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { - this.emptyAsNull = emptyAsNull; - return this; - } - - //----------------------------------------------------------------------- - /** - * Gets whether the tokenizer currently ignores empty tokens. 
- * The default for this property is true. - * - * @return true if empty tokens are not returned - */ - public boolean isIgnoreEmptyTokens() { - return ignoreEmptyTokens; - } - - /** - * Sets whether the tokenizer should ignore and not return empty tokens. - * The default for this property is true. - * - * @param ignoreEmptyTokens whether empty tokens are not returned - * @return this, to enable chaining - */ - public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { - this.ignoreEmptyTokens = ignoreEmptyTokens; - return this; - } - - //----------------------------------------------------------------------- - /** - * Gets the String content that the tokenizer is parsing. - * - * @return the string content being parsed - */ - public String getContent() { - if (chars == null) { - return null; - } - return new String(chars); - } - - //----------------------------------------------------------------------- - /** - * Creates a new instance of this Tokenizer. The new instance is reset so - * that it will be at the start of the token list. - * If a {@link CloneNotSupportedException} is caught, return <code>null</code>. - * - * @return a new instance of this Tokenizer which has been reset. - */ - @Override - public Object clone() { - try { - return cloneReset(); - } catch (final CloneNotSupportedException ex) { - return null; - } - } - - /** - * Creates a new instance of this Tokenizer. The new instance is reset so that - * it will be at the start of the token list. - * - * @return a new instance of this Tokenizer which has been reset. 
- * @throws CloneNotSupportedException if there is a problem cloning - */ - Object cloneReset() throws CloneNotSupportedException { - // this method exists to enable 100% test coverage - final StrTokenizer cloned = (StrTokenizer) super.clone(); - if (cloned.chars != null) { - cloned.chars = cloned.chars.clone(); - } - cloned.reset(); - return cloned; - } - - //----------------------------------------------------------------------- - /** - * Returns a short description of this tokenizer for debugging. - * <p> - * If the input has not been tokenized yet a fixed placeholder is returned; - * otherwise the result is <code>"StrTokenizer"</code> followed by the token list. - * - * @return a description of the tokenizer state, not the content being parsed - */ - @Override - public String toString() { - if (tokens == null) { - return "StrTokenizer[not tokenized yet]"; - } - return "StrTokenizer" + getTokenList(); - } - -}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/StringEscapeUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/beta/StringEscapeUtils.java deleted file mode 100644 index d6f8ded..0000000 --- a/src/main/java/org/apache/commons/text/beta/StringEscapeUtils.java +++ /dev/null @@ -1,959 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.text.beta; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.text.beta.translate.AggregateTranslator; -import org.apache.commons.text.beta.translate.CharSequenceTranslator; -import org.apache.commons.text.beta.translate.CsvTranslators; -import org.apache.commons.text.beta.translate.EntityArrays; -import org.apache.commons.text.beta.translate.JavaUnicodeEscaper; -import org.apache.commons.text.beta.translate.LookupTranslator; -import org.apache.commons.text.beta.translate.NumericEntityEscaper; -import org.apache.commons.text.beta.translate.NumericEntityUnescaper; -import org.apache.commons.text.beta.translate.OctalUnescaper; -import org.apache.commons.text.beta.translate.SingleLookupTranslator; -import org.apache.commons.text.beta.translate.UnicodeUnescaper; -import org.apache.commons.text.beta.translate.UnicodeUnpairedSurrogateRemover; - -import java.io.IOException; -import java.io.Writer; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -/** - * <p>Escapes and unescapes {@code String}s for - * Java, Java Script, HTML and XML.</p> - * - * <p>#ThreadSafe#</p> - * - * - * <p> - * This code has been adapted from Apache Commons Lang 3.5. - * </p> - * - * @since 1.0 - */ -public class StringEscapeUtils { - - /* ESCAPE TRANSLATORS */ - - /** - * Translator object for escaping Java. - * - * While {@link #escapeJava(String)} is the expected method of use, this - * object allows the Java escaping functionality to be used - * as the foundation for a custom translator. 
- */ - public static final CharSequenceTranslator ESCAPE_JAVA; - static { - Map<CharSequence, CharSequence> escapeJavaMap = new HashMap<>(); - escapeJavaMap.put("\"", "\\\""); - escapeJavaMap.put("\\", "\\\\"); - ESCAPE_JAVA = new AggregateTranslator( - new LookupTranslator(Collections.unmodifiableMap(escapeJavaMap)), - new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE), - JavaUnicodeEscaper.outsideOf(32, 0x7f) - ); - } - - /** - * Translator object for escaping EcmaScript/JavaScript. - * - * While {@link #escapeEcmaScript(String)} is the expected method of use, this - * object allows the EcmaScript escaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator ESCAPE_ECMASCRIPT; - static { - Map<CharSequence, CharSequence> escapeEcmaScriptMap = new HashMap<>(); - escapeEcmaScriptMap.put("'", "\\'"); - escapeEcmaScriptMap.put("\"", "\\\""); - escapeEcmaScriptMap.put("\\", "\\\\"); - escapeEcmaScriptMap.put("/", "\\/"); - ESCAPE_ECMASCRIPT = new AggregateTranslator( - new LookupTranslator(Collections.unmodifiableMap(escapeEcmaScriptMap)), - new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE), - JavaUnicodeEscaper.outsideOf(32, 0x7f) - ); - } - - /** - * Translator object for escaping Json. - * - * While {@link #escapeJson(String)} is the expected method of use, this - * object allows the Json escaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator ESCAPE_JSON; - static { - Map<CharSequence, CharSequence> escapeJsonMap = new HashMap<>(); - escapeJsonMap.put("\"", "\\\""); - escapeJsonMap.put("\\", "\\\\"); - escapeJsonMap.put("/", "\\/"); - ESCAPE_JSON = new AggregateTranslator( - new LookupTranslator(Collections.unmodifiableMap(escapeJsonMap)), - new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE), - JavaUnicodeEscaper.outsideOf(32, 0x7f) - ); - } - - /** - * Translator object for escaping XML 1.0. 
- * - * While {@link #escapeXml10(String)} is the expected method of use, this - * object allows the XML escaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator ESCAPE_XML10; - static { - Map<CharSequence, CharSequence> escapeXml10Map = new HashMap<>(); - escapeXml10Map.put("\u0000", StringUtils.EMPTY); - escapeXml10Map.put("\u0001", StringUtils.EMPTY); - escapeXml10Map.put("\u0002", StringUtils.EMPTY); - escapeXml10Map.put("\u0003", StringUtils.EMPTY); - escapeXml10Map.put("\u0004", StringUtils.EMPTY); - escapeXml10Map.put("\u0005", StringUtils.EMPTY); - escapeXml10Map.put("\u0006", StringUtils.EMPTY); - escapeXml10Map.put("\u0007", StringUtils.EMPTY); - escapeXml10Map.put("\u0008", StringUtils.EMPTY); - escapeXml10Map.put("\u000b", StringUtils.EMPTY); - escapeXml10Map.put("\u000c", StringUtils.EMPTY); - escapeXml10Map.put("\u000e", StringUtils.EMPTY); - escapeXml10Map.put("\u000f", StringUtils.EMPTY); - escapeXml10Map.put("\u0010", StringUtils.EMPTY); - escapeXml10Map.put("\u0011", StringUtils.EMPTY); - escapeXml10Map.put("\u0012", StringUtils.EMPTY); - escapeXml10Map.put("\u0013", StringUtils.EMPTY); - escapeXml10Map.put("\u0014", StringUtils.EMPTY); - escapeXml10Map.put("\u0015", StringUtils.EMPTY); - escapeXml10Map.put("\u0016", StringUtils.EMPTY); - escapeXml10Map.put("\u0017", StringUtils.EMPTY); - escapeXml10Map.put("\u0018", StringUtils.EMPTY); - escapeXml10Map.put("\u0019", StringUtils.EMPTY); - escapeXml10Map.put("\u001a", StringUtils.EMPTY); - escapeXml10Map.put("\u001b", StringUtils.EMPTY); - escapeXml10Map.put("\u001c", StringUtils.EMPTY); - escapeXml10Map.put("\u001d", StringUtils.EMPTY); - escapeXml10Map.put("\u001e", StringUtils.EMPTY); - escapeXml10Map.put("\u001f", StringUtils.EMPTY); - escapeXml10Map.put("\ufffe", StringUtils.EMPTY); - escapeXml10Map.put("\uffff", StringUtils.EMPTY); - ESCAPE_XML10 = new AggregateTranslator( - new 
LookupTranslator(EntityArrays.BASIC_ESCAPE), - new LookupTranslator(EntityArrays.APOS_ESCAPE), - new LookupTranslator(Collections.unmodifiableMap(escapeXml10Map)), - NumericEntityEscaper.between(0x7f, 0x84), - NumericEntityEscaper.between(0x86, 0x9f), - new UnicodeUnpairedSurrogateRemover() - ); - } - - /** - * Translator object for escaping XML 1.1. - * - * While {@link #escapeXml11(String)} is the expected method of use, this - * object allows the XML escaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator ESCAPE_XML11; - static { - Map<CharSequence, CharSequence> escapeXml11Map = new HashMap<>(); - escapeXml11Map.put("\u0000", StringUtils.EMPTY); - escapeXml11Map.put("\u000b", ""); - escapeXml11Map.put("\u000c", ""); - escapeXml11Map.put("\ufffe", StringUtils.EMPTY); - escapeXml11Map.put("\uffff", StringUtils.EMPTY); - ESCAPE_XML11 = new AggregateTranslator( - new LookupTranslator(EntityArrays.BASIC_ESCAPE), - new LookupTranslator(EntityArrays.APOS_ESCAPE), - new LookupTranslator(Collections.unmodifiableMap(escapeXml11Map)), - NumericEntityEscaper.between(0x1, 0x8), - NumericEntityEscaper.between(0xe, 0x1f), - NumericEntityEscaper.between(0x7f, 0x84), - NumericEntityEscaper.between(0x86, 0x9f), - new UnicodeUnpairedSurrogateRemover() - ); - } - - /** - * Translator object for escaping HTML version 3.0. - * - * While {@link #escapeHtml3(String)} is the expected method of use, this - * object allows the HTML escaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator ESCAPE_HTML3 = - new AggregateTranslator( - new LookupTranslator(EntityArrays.BASIC_ESCAPE), - new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE) - ); - - /** - * The improved translator object for escaping HTML version 3.0. - * The 'improved' part of this translator is that it checks if the html is already translated. 
- * This check prevents double, triple, or recursive translations. - * - * While {@link #escapeHtml3Once(String)} is the expected method of use, this - * object allows the HTML escaping functionality to be used - * as the foundation for a custom translator. - * - * Note that, multiple lookup tables should be passed to this translator - * instead of passing multiple instances of this translator to the - * AggregateTranslator. Because, a SingleLookupTranslator only checks the values of the - * lookup table passed to that instance while deciding whether a value is - * already translated or not. - */ - public static final CharSequenceTranslator ESCAPE_HTML3_ONCE = - new SingleLookupTranslator(EntityArrays.BASIC_ESCAPE, EntityArrays.ISO8859_1_ESCAPE); - - - /** - * Translator object for escaping HTML version 4.0. - * - * While {@link #escapeHtml4(String)} is the expected method of use, this - * object allows the HTML escaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator ESCAPE_HTML4 = - new AggregateTranslator( - new LookupTranslator(EntityArrays.BASIC_ESCAPE), - new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE), - new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE) - ); - - /** - * The improved translator object for escaping HTML version 4.0. - * The 'improved' part of this translator is that it checks if the html is already translated. - * This check prevents double, triple, or recursive translations. - * - * While {@link #escapeHtml4Once(String)} is the expected method of use, this - * object allows the HTML escaping functionality to be used - * as the foundation for a custom translator. - * - * Note that, multiple lookup tables should be passed to this translator - * instead of passing multiple instances of this translator to the - * AggregateTranslator. 
Because, a SingleLookupTranslator only checks the values of the - * lookup table passed to that instance while deciding whether a value is - * already translated or not. - */ - public static final CharSequenceTranslator ESCAPE_HTML4_ONCE = - new SingleLookupTranslator( - EntityArrays.BASIC_ESCAPE, - EntityArrays.ISO8859_1_ESCAPE, - EntityArrays.HTML40_EXTENDED_ESCAPE - ); - - /** - * Translator object for escaping individual Comma Separated Values. - * - * While {@link #escapeCsv(String)} is the expected method of use, this - * object allows the CSV escaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator ESCAPE_CSV = new CsvTranslators.CsvEscaper(); - - /** - * Translator object for escaping Shell command language. - * - * @see <a href="http://pubs.opengroup.org/onlinepubs/7908799/xcu/chap2.html">Shell Command Language</a> - */ - public static final CharSequenceTranslator ESCAPE_XSI; - static { - Map<CharSequence, CharSequence> escapeXsiMap = new HashMap<>(); - escapeXsiMap.put("|", "\\|"); - escapeXsiMap.put("&", "\\&"); - escapeXsiMap.put(";", "\\;"); - escapeXsiMap.put("<", "\\<"); - escapeXsiMap.put(">", "\\>"); - escapeXsiMap.put("(", "\\("); - escapeXsiMap.put(")", "\\)"); - escapeXsiMap.put("$", "\\$"); - escapeXsiMap.put("`", "\\`"); - escapeXsiMap.put("\\", "\\\\"); - escapeXsiMap.put("\"", "\\\""); - escapeXsiMap.put("'", "\\'"); - escapeXsiMap.put(" ", "\\ "); - escapeXsiMap.put("\t", "\\\t"); - escapeXsiMap.put("\r\n", ""); - escapeXsiMap.put("\n", ""); - escapeXsiMap.put("*", "\\*"); - escapeXsiMap.put("?", "\\?"); - escapeXsiMap.put("[", "\\["); - escapeXsiMap.put("#", "\\#"); - escapeXsiMap.put("~", "\\~"); - escapeXsiMap.put("=", "\\="); - escapeXsiMap.put("%", "\\%"); - ESCAPE_XSI = new LookupTranslator( - Collections.unmodifiableMap(escapeXsiMap) - ); - } - - /* UNESCAPE TRANSLATORS */ - - /** - * Translator object for unescaping escaped Java. 
- * - * While {@link #unescapeJava(String)} is the expected method of use, this - * object allows the Java unescaping functionality to be used - * as the foundation for a custom translator. - */ - // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)? - public static final CharSequenceTranslator UNESCAPE_JAVA; - static { - Map<CharSequence, CharSequence> unescapeJavaMap = new HashMap<>(); - unescapeJavaMap.put("\\\\", "\\"); - unescapeJavaMap.put("\\\"", "\""); - unescapeJavaMap.put("\\'", "'"); - unescapeJavaMap.put("\\", ""); - UNESCAPE_JAVA = new AggregateTranslator( - new OctalUnescaper(), // .between('\1', '\377'), - new UnicodeUnescaper(), - new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE), - new LookupTranslator(Collections.unmodifiableMap(unescapeJavaMap)) - ); - } - - /** - * Translator object for unescaping escaped EcmaScript. - * - * While {@link #unescapeEcmaScript(String)} is the expected method of use, this - * object allows the EcmaScript unescaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA; - - /** - * Translator object for unescaping escaped Json. - * - * While {@link #unescapeJson(String)} is the expected method of use, this - * object allows the Json unescaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator UNESCAPE_JSON = UNESCAPE_JAVA; - - /** - * Translator object for unescaping escaped HTML 3.0. - * - * While {@link #unescapeHtml3(String)} is the expected method of use, this - * object allows the HTML unescaping functionality to be used - * as the foundation for a custom translator. 
- */ - public static final CharSequenceTranslator UNESCAPE_HTML3 = - new AggregateTranslator( - new LookupTranslator(EntityArrays.BASIC_UNESCAPE), - new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE), - new NumericEntityUnescaper() - ); - - /** - * Translator object for unescaping escaped HTML 4.0. - * - * While {@link #unescapeHtml4(String)} is the expected method of use, this - * object allows the HTML unescaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator UNESCAPE_HTML4 = - new AggregateTranslator( - new LookupTranslator(EntityArrays.BASIC_UNESCAPE), - new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE), - new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE), - new NumericEntityUnescaper() - ); - - /** - * Translator object for unescaping escaped XML. - * - * While {@link #unescapeXml(String)} is the expected method of use, this - * object allows the XML unescaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator UNESCAPE_XML = - new AggregateTranslator( - new LookupTranslator(EntityArrays.BASIC_UNESCAPE), - new LookupTranslator(EntityArrays.APOS_UNESCAPE), - new NumericEntityUnescaper() - ); - - /** - * Translator object for unescaping escaped Comma Separated Value entries. - * - * While {@link #unescapeCsv(String)} is the expected method of use, this - * object allows the CSV unescaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator UNESCAPE_CSV = new CsvTranslators.CsvUnescaper(); - - /** - * Translator object for unescaping escaped XSI Value entries. - * - * While {@link #unescapeXSI(String)} is the expected method of use, this - * object allows the XSI unescaping functionality to be used - * as the foundation for a custom translator. 
- */ - public static final CharSequenceTranslator UNESCAPE_XSI = new XsiUnescaper(); - - /** - * Translator object for unescaping backslash escaped entries. - */ - static class XsiUnescaper extends CharSequenceTranslator { - - /** - * The backslash character that marks an escape. - */ - private static final char BACKSLASH = '\\'; - - /** - * Unescapes the whole input in one call. - * <p> - * Every backslash is dropped and the character that follows it is copied - * unchanged, so an escaped backslash yields a single backslash; a trailing - * lone backslash is dropped. - * - * @param input the text to unescape - * @param index must be 0, because the first call already handles the entire input - * @param out the writer that receives the unescaped text - * @return the number of code points in {@code input} - * @throws IllegalStateException if {@code index} is not 0 - * @throws IOException if writing to {@code out} fails - */ - @Override - public int translate(final CharSequence input, final int index, final Writer out) throws IOException { - - if (index != 0) { - throw new IllegalStateException("XsiUnescaper should never reach the [1] index"); - } - - String s = input.toString(); - - int segmentStart = 0; - int searchOffset = 0; - while (true) { - int pos = s.indexOf(BACKSLASH, searchOffset); - if (pos == -1) { - if (segmentStart < s.length()) { - out.write(s.substring(segmentStart)); - } - break; - } - if (pos > segmentStart) { - out.write(s.substring(segmentStart, pos)); - } - segmentStart = pos + 1; - searchOffset = pos + 2; - } - - return Character.codePointCount(input, 0, input.length()); - } - } - - /* Helper functions */ - - /** - * <p>{@code StringEscapeUtils} instances should NOT be constructed in - * standard programming.</p> - * - * <p>Instead, the class should be used as:</p> - * <pre>StringEscapeUtils.escapeJava("foo");</pre> - * - * <p>This constructor is public to permit tools that require a JavaBean - * instance to operate.</p> - */ - public StringEscapeUtils() { - super(); - } - - /** - * <p>Convenience wrapper for {@link java.lang.StringBuilder} providing escape methods.</p> - * - * <p>Instances are obtained from {@link #builder(CharSequenceTranslator)}; the constructor is private.</p> - * - * <p>Example:</p> - * <pre> - * StringEscapeUtils.builder(ESCAPE_HTML4) - * .append("<p>") - * .escape("This is paragraph 1 and special chars like & get escaped.") - * .append("</p><p>") - * .escape("This is paragraph 2 & more...") - * .append("</p>") - * .toString() - * </pre> - * - */ - public static final class Builder { - - /** - * StringBuilder to be used in the Builder class. - */ - private final StringBuilder sb; - - /** - * CharSequenceTranslator to be used in the Builder class.
- */ - private final CharSequenceTranslator translator; - - /** - * Builder constructor. - * - * @param translator a CharSequenceTranslator. - */ - private Builder(final CharSequenceTranslator translator) { - this.sb = new StringBuilder(); - this.translator = translator; - } - - /** - * <p>Escape {@code input} according to the given {@link CharSequenceTranslator}.</p> - * - * @param input the String to escape - * @return {@code this}, to enable chaining - */ - public Builder escape(final String input) { - sb.append(translator.translate(input)); - return this; - } - - /** - * Literal append, no escaping being done. - * - * @param input the String to append - * @return {@code this}, to enable chaining - */ - public Builder append(final String input) { - sb.append(input); - return this; - } - - /** - * <p>Return the escaped string.</p> - * - * @return the escaped string - */ - @Override - public String toString() { - return sb.toString(); - } - } - - /** - * Get a {@link Builder}. - * @param translator the text translator - * @return {@link Builder} - */ - public static StringEscapeUtils.Builder builder(final CharSequenceTranslator translator) { - return new Builder(translator); - } - - // Java and JavaScript - //-------------------------------------------------------------------------- - /** - * <p>Escapes the characters in a {@code String} using Java String rules.</p> - * - * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p> - * - * <p>So a tab becomes the characters {@code '\\'} and - * {@code 't'}.</p> - * - * <p>The only difference between Java strings and JavaScript strings - * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p> - * - * <p>Example:</p> - * <pre> - * input string: He didn't say, "Stop!" 
- * output string: He didn't say, \"Stop!\" - * </pre> - * - * @param input String to escape values in, may be null - * @return String with escaped values, {@code null} if null string input - */ - public static final String escapeJava(final String input) { - return ESCAPE_JAVA.translate(input); - } - - /** - * <p>Escapes the characters in a {@code String} using EcmaScript String rules.</p> - * <p>Escapes any values it finds into their EcmaScript String form. - * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p> - * - * <p>So a tab becomes the characters {@code '\\'} and - * {@code 't'}.</p> - * - * <p>The only difference between Java strings and EcmaScript strings - * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p> - * - * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects. </p> - * - * <p>Example:</p> - * <pre> - * input string: He didn't say, "Stop!" - * output string: He didn\'t say, \"Stop!\" - * </pre> - * - * <b>Security Note.</b> We only provide backslash escaping in this method. For example, {@code '\"'} has the output - * {@code '\\\"'} which could result in potential issues in the case where the string being escaped is being used - * in an HTML tag like {@code <select onmouseover="..." />}. If you wish to have more rigorous string escaping, you - * may consider the - * <a href="https://www.owasp.org/index.php/Category:OWASP_Enterprise_Security_API_JAVA">ESAPI Libraries</a>. - * Further, you can view the <a href="https://github.com/esapi">ESAPI GitHub Org</a>. - * - * @param input String to escape values in, may be null - * @return String with escaped values, {@code null} if null string input - */ - public static final String escapeEcmaScript(final String input) { - return ESCAPE_ECMASCRIPT.translate(input); - } - - /** - * <p>Escapes the characters in a {@code String} using Json String rules.</p> - * <p>Escapes any values it finds into their Json String form. 
- * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p> - * - * <p>So a tab becomes the characters {@code '\\'} and - * {@code 't'}.</p> - * - * <p>The only difference between Java strings and Json strings - * is that in Json, forward-slash (/) is escaped.</p> - * - * <p>See http://www.ietf.org/rfc/rfc4627.txt for further details. </p> - * - * <p>Example:</p> - * <pre> - * input string: He didn't say, "Stop!" - * output string: He didn't say, \"Stop!\" - * </pre> - * - * @param input String to escape values in, may be null - * @return String with escaped values, {@code null} if null string input - */ - public static final String escapeJson(final String input) { - return ESCAPE_JSON.translate(input); - } - - /** - * <p>Unescapes any Java literals found in the {@code String}. - * For example, it will turn a sequence of {@code '\'} and - * {@code 'n'} into a newline character, unless the {@code '\'} - * is preceded by another {@code '\'}.</p> - * - * @param input the {@code String} to unescape, may be null - * @return a new unescaped {@code String}, {@code null} if null string input - */ - public static final String unescapeJava(final String input) { - return UNESCAPE_JAVA.translate(input); - } - - /** - * <p>Unescapes any EcmaScript literals found in the {@code String}.</p> - * - * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'} - * into a newline character, unless the {@code '\'} is preceded by another - * {@code '\'}.</p> - * - * @see #unescapeJava(String) - * @param input the {@code String} to unescape, may be null - * @return A new unescaped {@code String}, {@code null} if null string input - */ - public static final String unescapeEcmaScript(final String input) { - return UNESCAPE_ECMASCRIPT.translate(input); - } - - /** - * <p>Unescapes any Json literals found in the {@code String}.</p> - * - * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'} - * into a newline character, unless the 
{@code '\'} is preceded by another - * {@code '\'}.</p> - * - * @see #unescapeJava(String) - * @param input the {@code String} to unescape, may be null - * @return A new unescaped {@code String}, {@code null} if null string input - */ - public static final String unescapeJson(final String input) { - return UNESCAPE_JSON.translate(input); - } - - // HTML and XML - //-------------------------------------------------------------------------- - /** - * <p>Escapes the characters in a {@code String} using HTML entities.</p> - * - * <p> - * For example: - * </p> - * <p><code>"bread" & "butter"</code></p> - * becomes: - * <p> - * <code>&quot;bread&quot; &amp; &quot;butter&quot;</code>. - * </p> - * - * <p>Supports all known HTML 4.0 entities, including funky accents. - * Note that the commonly used apostrophe escape character (&apos;) - * is not a legal entity and so is not supported). </p> - * - * @param input the {@code String} to escape, may be null - * @return a new escaped {@code String}, {@code null} if null string input - * - * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> - * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> - * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> - * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> - * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> - */ - public static final String escapeHtml4(final String input) { - return ESCAPE_HTML4.translate(input); - } - - /** - * <p>Escapes the characters in a {@code String} using HTML entities. - * But escapes them only once. i.e. 
does not escape already escaped characters.</p> - * - * <p> - * For example: - * </p> - * <p><code>"bread" & "butter"</code></p> - * becomes: - * <p> - * <code>&quot;bread&quot; &amp; &quot;butter&quot;</code>. - * </p> - * - * <p> - * But: - * </p> - * <p><code>&quot;bread&quot; &amp; &quot;butter&quot;</code></p> - * remains unaffected. - * - * <p>Supports all known HTML 4.0 entities, including funky accents. - * Note that the commonly used apostrophe escape character (&apos;) - * is not a legal entity and so is not supported). </p> - * - * @param input the {@code String} to escape, may be null - * @return a new escaped {@code String}, {@code null} if null string input - * - * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> - * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> - * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> - * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> - * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> - */ - public static final String escapeHtml4Once(final String input) { - return ESCAPE_HTML4_ONCE.translate(input); - } - - - /** - * <p>Escapes the characters in a {@code String} using HTML entities.</p> - * <p>Supports only the HTML 3.0 entities. </p> - * - * @param input the {@code String} to escape, may be null - * @return a new escaped {@code String}, {@code null} if null string input - */ - public static final String escapeHtml3(final String input) { - return ESCAPE_HTML3.translate(input); - } - - /** - * <p>Escapes the characters in a {@code String} using HTML entities. - * But escapes them only once. i.e. does not escape already escaped characters.</p> - * <p>Supports only the HTML 3.0 entities. 
</p> - * - * @param input the {@code String} to escape, may be null - * @return a new escaped {@code String}, {@code null} if null string input - */ - public static final String escapeHtml3Once(final String input) { - return ESCAPE_HTML3_ONCE.translate(input); - } - - //----------------------------------------------------------------------- - /** - * <p>Unescapes a string containing entity escapes to a string - * containing the actual Unicode characters corresponding to the - * escapes. Supports HTML 4.0 entities.</p> - * - * <p>For example, the string {@code "&lt;Fran&ccedil;ais&gt;"} - * will become {@code "<Français>"}</p> - * - * <p>If an entity is unrecognized, it is left alone, and inserted - * verbatim into the result string. e.g. {@code "&gt;&zzzz;x"} will - * become {@code ">&zzzz;x"}.</p> - * - * @param input the {@code String} to unescape, may be null - * @return a new unescaped {@code String}, {@code null} if null string input - */ - public static final String unescapeHtml4(final String input) { - return UNESCAPE_HTML4.translate(input); - } - - /** - * <p>Unescapes a string containing entity escapes to a string - * containing the actual Unicode characters corresponding to the - * escapes. Supports only HTML 3.0 entities.</p> - * - * @param input the {@code String} to unescape, may be null - * @return a new unescaped {@code String}, {@code null} if null string input - */ - public static final String unescapeHtml3(final String input) { - return UNESCAPE_HTML3.translate(input); - } - - /** - * <p>Escapes the characters in a {@code String} using XML entities.</p> - * - * <p>For example: {@code "bread" & "butter"} => - * {@code &quot;bread&quot; &amp; &quot;butter&quot;}. - * </p> - * - * <p>Note that XML 1.0 is a text-only format: it cannot represent control - * characters or unpaired Unicode surrogate codepoints, even after escaping.
- * {@code escapeXml10} will remove characters that do not fit in the - * following ranges:</p> - * - * <p>{@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p> - * - * <p>Though not strictly necessary, {@code escapeXml10} will escape - * characters in the following ranges:</p> - * - * <p>{@code [#x7F-#x84] | [#x86-#x9F]}</p> - * - * <p>The returned string can be inserted into a valid XML 1.0 or XML 1.1 - * document. If you want to allow more non-text characters in an XML 1.1 - * document, use {@link #escapeXml11(String)}.</p> - * - * @param input the {@code String} to escape, may be null - * @return a new escaped {@code String}, {@code null} if null string input - * @see #unescapeXml(java.lang.String) - */ - public static String escapeXml10(final String input) { - return ESCAPE_XML10.translate(input); - } - - /** - * <p>Escapes the characters in a {@code String} using XML entities.</p> - * - * <p>For example: {@code "bread" & "butter"} => - * {@code &quot;bread&quot; &amp; &quot;butter&quot;}. - * </p> - * - * <p>XML 1.1 can represent certain control characters, but it cannot represent - * the null byte or unpaired Unicode surrogate codepoints, even after escaping. - * {@code escapeXml11} will remove characters that do not fit in the following - * ranges:</p> - * - * <p>{@code [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p> - * - * <p>{@code escapeXml11} will escape characters in the following ranges:</p> - * - * <p>{@code [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}</p> - * - * <p>The returned string can be inserted into a valid XML 1.1 document.
Do not - * use it for XML 1.0 documents.</p> - * - * @param input the {@code String} to escape, may be null - * @return a new escaped {@code String}, {@code null} if null string input - * @see #unescapeXml(java.lang.String) - */ - public static String escapeXml11(final String input) { - return ESCAPE_XML11.translate(input); - } - - //----------------------------------------------------------------------- - /** - * <p>Unescapes a string containing XML entity escapes to a string - * containing the actual Unicode characters corresponding to the - * escapes.</p> - * - * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos). - * Does not support DTDs or external entities.</p> - * - * <p>Note that numerical \\u Unicode codes are unescaped to their respective - * Unicode characters. This may change in future releases. </p> - * - * @param input the {@code String} to unescape, may be null - * @return a new unescaped {@code String}, {@code null} if null string input - * @see #escapeXml10(String) - * @see #escapeXml11(String) - */ - public static final String unescapeXml(final String input) { - return UNESCAPE_XML.translate(input); - } - - //----------------------------------------------------------------------- - - /** - * <p>Returns a {@code String} value for a CSV column enclosed in double quotes, - * if required.</p> - * - * <p>If the value contains a comma, newline or double quote, then the - * String value is returned enclosed in double quotes.</p> - * - * <p>Any double quote characters in the value are escaped with another double quote.</p> - * - * <p>If the value does not contain a comma, newline or double quote, then the - * String value is returned unchanged.</p> - * - * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and - * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>. 
- * - * @param input the input CSV column String, may be null - * @return the input String, enclosed in double quotes if the value contains a comma, - * newline or double quote, {@code null} if null string input - */ - public static final String escapeCsv(final String input) { - return ESCAPE_CSV.translate(input); - } - - /** - * <p>Returns a {@code String} value for an unescaped CSV column. </p> - * - * <p>If the value is enclosed in double quotes, and contains a comma, newline - * or double quote, then quotes are removed. - * </p> - * - * <p>Any double quote escaped characters (a pair of double quotes) are unescaped - * to just one double quote. </p> - * - * <p>If the value is not enclosed in double quotes, or is and does not contain a - * comma, newline or double quote, then the String value is returned unchanged.</p> - * - * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and - * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>. - * - * @param input the input CSV column String, may be null - * @return the input String, with enclosing double quotes removed and embedded double - * quotes unescaped, {@code null} if null string input - */ - public static final String unescapeCsv(final String input) { - return UNESCAPE_CSV.translate(input); - } - - /** - * <p>Escapes the characters in a {@code String} using XSI rules.</p> - * - * <p><b>Beware!</b> In most cases you don't want to escape shell commands but use multi-argument - * methods provided by {@link java.lang.ProcessBuilder} or {@link java.lang.Runtime#exec(String[])} - * instead.</p> - * - * <p>Example:</p> - * <pre> - * input string: He didn't say, "Stop!" 
- * output string: He\ didn\'t\ say,\ \"Stop!\" - * </pre> - * - * @see <a href="http://pubs.opengroup.org/onlinepubs/7908799/xcu/chap2.html">Shell Command Language</a> - * @param input String to escape values in, may be null - * @return String with escaped values, {@code null} if null string input - */ - public static final String escapeXSI(final String input) { - return ESCAPE_XSI.translate(input); - } - - /** - * <p>Unescapes the characters in a {@code String} using XSI rules.</p> - * - * @see StringEscapeUtils#escapeXSI(String) - * @param input the {@code String} to unescape, may be null - * @return a new unescaped {@code String}, {@code null} if null string input - */ - public static final String unescapeXSI(final String input) { - return UNESCAPE_XSI.translate(input); - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/CommandVisitor.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/CommandVisitor.java b/src/main/java/org/apache/commons/text/beta/diff/CommandVisitor.java deleted file mode 100644 index 0fdecbb..0000000 --- a/src/main/java/org/apache/commons/text/beta/diff/CommandVisitor.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.diff; - -/** - * This interface should be implemented by user object to walk - * through {@link EditScript EditScript} objects. - * <p> - * Users should implement this interface in order to walk through - * the {@link EditScript EditScript} object created by the comparison - * of two sequences. This is a direct application of the visitor - * design pattern. The {@link EditScript#visit EditScript.visit} - * method takes an object implementing this interface as an argument, - * it will perform the loop over all commands in the script and the - * proper methods of the user class will be called as the commands are - * encountered. - * </p> - * <p> - * The implementation of the user visitor class will depend on the - * need. Here are two examples. 
- * </p> - * <p> - * The first example is a visitor that builds the longest common - * subsequence: - * </p> - * <pre> - * import org.apache.commons.text.beta.diff.CommandVisitor; - * - * import java.util.ArrayList; - * - * public class LongestCommonSubSequence implements CommandVisitor { - * - * public LongestCommonSubSequence() { - * a = new ArrayList(); - * } - * - * public void visitInsertCommand(Object object) { - * } - * - * public void visitKeepCommand(Object object) { - * a.add(object); - * } - * - * public void visitDeleteCommand(Object object) { - * } - * - * public Object[] getSubSequence() { - * return a.toArray(); - * } - * - * private ArrayList a; - * - * } - * </pre> - * <p> - * The second example is a visitor that shows the commands and the way - * they transform the first sequence into the second one: - * </p> - * <pre> - * import org.apache.commons.text.beta.diff.CommandVisitor; - * - * import java.util.Arrays; - * import java.util.ArrayList; - * import java.util.Iterator; - * - * public class ShowVisitor implements CommandVisitor { - * - * public ShowVisitor(Object[] sequence1) { - * v = new ArrayList(); - * v.addAll(Arrays.asList(sequence1)); - * index = 0; - * } - * - * public void visitInsertCommand(Object object) { - * v.add(index++, object); - * display("insert", object); - * } - * - * public void visitKeepCommand(Object object) { - * ++index; - * display("keep ", object); - * } - * - * public void visitDeleteCommand(Object object) { - * v.remove(index); - * display("delete", object); - * } - * - * private void display(String commandName, Object object) { - * System.out.println(commandName + " " + object + ": " + this); - * } - * - * public String toString() { - * StringBuffer buffer = new StringBuffer(); - * for (Iterator iter = v.iterator(); iter.hasNext();) { - * buffer.append(' ').append(iter.next()); - * } - * return buffer.toString(); - * } - * - * private ArrayList v; - * private int index; - * - * } - * </pre> - * - * @param <T> object type
- * @since 1.0 - */ -public interface CommandVisitor<T> { - - /** - * Method called when an insert command is encountered. - * - * @param object object to insert (this object comes from the second sequence) - */ - void visitInsertCommand(T object); - - /** - * Method called when a keep command is encountered. - * - * @param object object to keep (this object comes from the first sequence) - */ - void visitKeepCommand(T object); - - /** - * Method called when a delete command is encountered. - * - * @param object object to delete (this object comes from the first sequence) - * @see DeleteCommand#accept(CommandVisitor) - */ - void visitDeleteCommand(T object); - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/DeleteCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/DeleteCommand.java b/src/main/java/org/apache/commons/text/beta/diff/DeleteCommand.java deleted file mode 100644 index 71bd418..0000000 --- a/src/main/java/org/apache/commons/text/beta/diff/DeleteCommand.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -package org.apache.commons.text.beta.diff; - -/** - * Command representing the deletion of one object of the first sequence. - * <p> - * When one object of the first sequence has no corresponding object in the - * second sequence at the right place, the {@link EditScript edit script} - * transforming the first sequence into the second sequence uses an instance of - * this class to represent the deletion of this object. The objects embedded in - * this type of command always come from the first sequence. - * </p> - * - * @see StringsComparator - * @see EditScript - * - * @param <T> object type - * @since 1.0 - */ -public class DeleteCommand<T> extends EditCommand<T> { - - /** - * Simple constructor. Creates a new instance of {@link DeleteCommand}. - * - * @param object the object of the first sequence that should be deleted - */ - public DeleteCommand(final T object) { - super(object); - } - - /** - * Accept a visitor. When a <code>DeleteCommand</code> accepts a visitor, it calls - * the visitor's {@link CommandVisitor#visitDeleteCommand visitDeleteCommand} method, - * passing the result of {@code getObject()}. - * - * @param visitor the visitor to be accepted - */ - @Override - public void accept(final CommandVisitor<T> visitor) { - visitor.visitDeleteCommand(getObject()); - } -}