http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/StrTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/StrTokenizer.java b/src/main/java/org/apache/commons/text/StrTokenizer.java new file mode 100644 index 0000000..a980bf9 --- /dev/null +++ b/src/main/java/org/apache/commons/text/StrTokenizer.java @@ -0,0 +1,1116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.ListIterator; +import java.util.NoSuchElementException; + +/** + * Tokenizes a string based on delimiters (separators) + * and supporting quoting and ignored character concepts. + * <p> + * This class can split a String into many smaller strings. It aims + * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, + * however it offers much more control and flexibility including implementing + * the <code>ListIterator</code> interface. By default, it is set up + * like <code>StringTokenizer</code>. 
+ * <p> + * The input String is split into a number of <i>tokens</i>. + * Each token is separated from the next token by a <i>delimiter</i>. + * One or more delimiter characters must be specified. + * <p> + * Each token may be surrounded by quotes. + * The <i>quote</i> matcher specifies the quote character(s). + * A quote may be escaped within a quoted section by duplicating itself. + * <p> + * Between each token and the delimiter there may be characters that need trimming. + * The <i>trimmer</i> matcher specifies these characters. + * One usage might be to trim whitespace characters. + * <p> + * At any point outside the quotes there might be invalid characters. + * The <i>ignored</i> matcher specifies these characters to be removed. + * One usage might be to remove new line characters. + * <p> + * Empty tokens may be removed or returned as null. + * <pre> + * "a,b,c" - Three tokens "a","b","c" (comma delimiter) + * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) + * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) + * </pre> + * <p> + * + * This tokenizer has the following properties and options: + * + * <table summary="Tokenizer Properties"> + * <tr> + * <th>Property</th><th>Type</th><th>Default</th> + * </tr> + * <tr> + * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> + * </tr> + * <tr> + * <td>quote</td><td>NoneMatcher</td><td>{}</td> + * </tr> + * <tr> + * <td>ignore</td><td>NoneMatcher</td><td>{}</td> + * </tr> + * <tr> + * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> + * </tr> + * <tr> + * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> + * </tr> + * </table> + */ +public class StrTokenizer implements ListIterator<String>, Cloneable { + + private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE; + private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE; + static { + CSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); + 
+ CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher()); + CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); + CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); + CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); + CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); + CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); + + TSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); + TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher()); + TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); + TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); + TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); + TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); + TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); + } + + /** The text to work on, null when no input has been set. */ + private char chars[]; + /** The parsed tokens, null until tokenization has been performed. */ + private String tokens[]; + /** The current iteration position, an index into the parsed tokens. */ + private int tokenPos; + + /** The delimiter matcher, the split matcher unless changed. */ + private StrMatcher delimMatcher = StrMatcher.splitMatcher(); + /** The quote matcher, the none matcher unless changed. */ + private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); + /** The ignored matcher, the none matcher unless changed. */ + private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); + /** The trimmer matcher, the none matcher unless changed. */ + private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); + + /** Whether to return empty tokens as null, only consulted when empty tokens are not ignored. */ + private boolean emptyAsNull = false; + /** Whether to ignore (drop) empty tokens. */ + private boolean ignoreEmptyTokens = true; + + //----------------------------------------------------------------------- + + /** + * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>. + * + * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>. 
+ */ + private static StrTokenizer getCSVClone() { + return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); + } + + /** + * Gets a new tokenizer instance which parses Comma Separated Value strings, + * with no input text set yet. The default for CSV processing + * is to trim whitespace from both ends (which can be overridden with + * the setTrimmerMatcher method). + * <p> + * You must call a "reset" method to set the string which you want to parse. + * @return a new tokenizer instance which parses Comma Separated Value strings + */ + public static StrTokenizer getCSVInstance() { + return getCSVClone(); + } + + /** + * Gets a new tokenizer instance which parses Comma Separated Value strings + * initializing it with the given input. The default for CSV processing + * is to trim whitespace from both ends (which can be overridden with + * the setTrimmerMatcher method). + * + * @param input the text to parse + * @return a new tokenizer instance which parses Comma Separated Value strings + */ + public static StrTokenizer getCSVInstance(final String input) { + final StrTokenizer tok = getCSVClone(); + tok.reset(input); + return tok; + } + + /** + * Gets a new tokenizer instance which parses Comma Separated Value strings + * initializing it with the given input. The default for CSV processing + * is to trim whitespace from both ends (which can be overridden with + * the setTrimmerMatcher method). + * + * @param input the text to parse, passed to reset(char[]) which does not clone it + * @return a new tokenizer instance which parses Comma Separated Value strings + */ + public static StrTokenizer getCSVInstance(final char[] input) { + final StrTokenizer tok = getCSVClone(); + tok.reset(input); + return tok; + } + + /** + * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>. + * + * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>. 
+ */ + private static StrTokenizer getTSVClone() { + return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); + } + + + /** + * Gets a new tokenizer instance which parses Tab Separated Value strings. + * The default for TSV processing is to trim whitespace from both ends + * (which can be overridden with the setTrimmerMatcher method). + * <p> + * You must call a "reset" method to set the string which you want to parse. + * @return a new tokenizer instance which parses Tab Separated Value strings. + */ + public static StrTokenizer getTSVInstance() { + return getTSVClone(); + } + + /** + * Gets a new tokenizer instance which parses Tab Separated Value strings. + * The default for TSV processing is to trim whitespace from both ends + * (which can be overridden with the setTrimmerMatcher method). + * @param input the string to parse + * @return a new tokenizer instance which parses Tab Separated Value strings. + */ + public static StrTokenizer getTSVInstance(final String input) { + final StrTokenizer tok = getTSVClone(); + tok.reset(input); + return tok; + } + + /** + * Gets a new tokenizer instance which parses Tab Separated Value strings. + * The default for TSV processing is to trim whitespace from both ends + * (which can be overridden with the setTrimmerMatcher method). + * @param input the characters to parse, passed to reset(char[]) which does not clone them + * @return a new tokenizer instance which parses Tab Separated Value strings. + */ + public static StrTokenizer getTSVInstance(final char[] input) { + final StrTokenizer tok = getTSVClone(); + tok.reset(input); + return tok; + } + + //----------------------------------------------------------------------- + /** + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer, but with no text to tokenize. + * <p> + * This constructor is normally used with {@link #reset(String)}. 
+ */ + public StrTokenizer() { + super(); + this.chars = null; + } + + /** + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. + * + * @param input the string which is to be parsed, null sets no text to parse + */ + public StrTokenizer(final String input) { + super(); + if (input != null) { + chars = input.toCharArray(); + } else { + chars = null; + } + } + + /** + * Constructs a tokenizer splitting on the specified delimiter character. + * + * @param input the string which is to be parsed, null sets no text to parse + * @param delim the field delimiter character + */ + public StrTokenizer(final String input, final char delim) { + this(input); + setDelimiterChar(delim); + } + + /** + * Constructs a tokenizer splitting on the specified delimiter string. + * + * @param input the string which is to be parsed, null sets no text to parse + * @param delim the field delimiter string + */ + public StrTokenizer(final String input, final String delim) { + this(input); + setDelimiterString(delim); + } + + /** + * Constructs a tokenizer splitting using the specified delimiter matcher. + * + * @param input the string which is to be parsed, null sets no text to parse + * @param delim the field delimiter matcher + */ + public StrTokenizer(final String input, final StrMatcher delim) { + this(input); + setDelimiterMatcher(delim); + } + + /** + * Constructs a tokenizer splitting on the specified delimiter character + * and handling quotes using the specified quote character. + * + * @param input the string which is to be parsed, null sets no text to parse + * @param delim the field delimiter character + * @param quote the field quote character + */ + public StrTokenizer(final String input, final char delim, final char quote) { + this(input, delim); + setQuoteChar(quote); + } + + /** + * Constructs a tokenizer splitting using the specified delimiter matcher + * and handling quotes using the specified quote matcher. 
+ * + * @param input the string which is to be parsed + * @param delim the field delimiter matcher + * @param quote the field quote matcher + */ + public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { + this(input, delim); + setQuoteMatcher(quote); + } + + /** + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. + * + * @param input the characters which are to be parsed, cloned; null sets no text to parse + */ + public StrTokenizer(final char[] input) { + super(); + if (input == null) { + this.chars = null; + } else { + this.chars = input.clone(); + } + } + + /** + * Constructs a tokenizer splitting on the specified character. + * + * @param input the characters which are to be parsed, cloned + * @param delim the field delimiter character + */ + public StrTokenizer(final char[] input, final char delim) { + this(input); + setDelimiterChar(delim); + } + + /** + * Constructs a tokenizer splitting on the specified string. + * + * @param input the characters which are to be parsed, cloned + * @param delim the field delimiter string + */ + public StrTokenizer(final char[] input, final String delim) { + this(input); + setDelimiterString(delim); + } + + /** + * Constructs a tokenizer splitting using the specified delimiter matcher. + * + * @param input the characters which are to be parsed, cloned + * @param delim the field delimiter matcher + */ + public StrTokenizer(final char[] input, final StrMatcher delim) { + this(input); + setDelimiterMatcher(delim); + } + + /** + * Constructs a tokenizer splitting on the specified delimiter character + * and handling quotes using the specified quote character. 
+ * + * @param input the characters which are to be parsed, cloned + * @param delim the field delimiter character + * @param quote the field quote character + */ + public StrTokenizer(final char[] input, final char delim, final char quote) { + this(input, delim); + setQuoteChar(quote); + } + + /** + * Constructs a tokenizer splitting using the specified delimiter matcher + * and handling quotes using the specified quote matcher. + * + * @param input the characters which are to be parsed, cloned + * @param delim the field delimiter matcher + * @param quote the field quote matcher + */ + public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { + this(input, delim); + setQuoteMatcher(quote); + } + + // API + //----------------------------------------------------------------------- + /** + * Gets the number of tokens found in the String, tokenizing first if needed. + * + * @return the number of matched tokens + */ + public int size() { + checkTokenized(); + return tokens.length; + } + + /** + * Gets the next token from the String. + * Equivalent to {@link #next()} except it returns null rather than + * throwing {@link NoSuchElementException} when no tokens remain. + * + * @return the next sequential token, or null when no more tokens are found + */ + public String nextToken() { + if (hasNext()) { + return tokens[tokenPos++]; + } + return null; + } + + /** + * Gets the previous token from the String, moving the position back by one. + * + * @return the previous sequential token, or null when there is no previous token + */ + public String previousToken() { + if (hasPrevious()) { + return tokens[--tokenPos]; + } + return null; + } + + /** + * Gets a copy of the full token list as an independent modifiable array. + * + * @return the tokens as a String array + */ + public String[] getTokenArray() { + checkTokenized(); + return tokens.clone(); + } + + /** + * Gets a copy of the full token list as an independent modifiable list. 
+ * + * @return the tokens as a String list + */ + public List<String> getTokenList() { + checkTokenized(); + final List<String> list = new ArrayList<>(tokens.length); + for (final String element : tokens) { + list.add(element); + } + return list; + } + + /** + * Resets this tokenizer, forgetting all parsing and iteration already completed. + * <p> + * This method allows the same tokenizer to be reused for the same String. + * + * @return this, to enable chaining + */ + public org.apache.commons.text.StrTokenizer reset() { + tokenPos = 0; + tokens = null; + return this; + } + + /** + * Resets this tokenizer, giving it a new input string to parse. + * In this manner you can re-use a tokenizer with the same settings + * on multiple input lines. + * + * @param input the new string to tokenize, null sets no text to parse + * @return this, to enable chaining + */ + public org.apache.commons.text.StrTokenizer reset(final String input) { + reset(); + if (input != null) { + this.chars = input.toCharArray(); + } else { + this.chars = null; + } + return this; + } + + /** + * Resets this tokenizer, giving it a new input character array to parse. + * In this manner you can re-use a tokenizer with the same settings + * on multiple input lines. + * + * @param input the new character array to tokenize, not cloned, null sets no text to parse + * @return this, to enable chaining + */ + public org.apache.commons.text.StrTokenizer reset(final char[] input) { + reset(); + if (input != null) { + this.chars = input; + } else { + this.chars = null; + } + return this; + } + + // ListIterator + //----------------------------------------------------------------------- + /** + * Checks whether there are any more tokens, tokenizing first if needed. + * + * @return true if there are more tokens + */ + @Override + public boolean hasNext() { + checkTokenized(); + return tokenPos < tokens.length; + } + + /** + * Gets the next token. 
+ * + * @return the next String token + * @throws NoSuchElementException if there are no more elements + */ + @Override + public String next() { + if (hasNext()) { + return tokens[tokenPos++]; + } + throw new NoSuchElementException(); + } + + /** + * Gets the index of the next token to return. + * + * @return the next token index + */ + @Override + public int nextIndex() { + return tokenPos; + } + + /** + * Checks whether there are any previous tokens that can be iterated to. + * + * @return true if there are previous tokens + */ + @Override + public boolean hasPrevious() { + checkTokenized(); + return tokenPos > 0; + } + + /** + * Gets the token before the current position and moves the position back by one; throws NoSuchElementException if there is none. + * + * @return the previous token + */ + @Override + public String previous() { + if (hasPrevious()) { + return tokens[--tokenPos]; + } + throw new NoSuchElementException(); + } + + /** + * Gets the index of the previous token. + * + * @return the previous token index, -1 when positioned at the first token + */ + @Override + public int previousIndex() { + return tokenPos - 1; + } + + /** + * Unsupported ListIterator operation. + * + * @throws UnsupportedOperationException always + */ + @Override + public void remove() { + throw new UnsupportedOperationException("remove() is unsupported"); + } + + /** + * Unsupported ListIterator operation. + * @param obj this parameter is ignored. + * @throws UnsupportedOperationException always + */ + @Override + public void set(final String obj) { + throw new UnsupportedOperationException("set() is unsupported"); + } + + /** + * Unsupported ListIterator operation. + * @param obj this parameter is ignored. + * @throws UnsupportedOperationException always + */ + @Override + public void add(final String obj) { + throw new UnsupportedOperationException("add() is unsupported"); + } + + // Implementation + //----------------------------------------------------------------------- + /** + * Checks whether tokenization has been done, and performs it if not. 
+ */ + private void checkTokenized() { + if (tokens == null) { + if (chars == null) { + // still call tokenize as subclass may do some work + final List<String> split = tokenize(null, 0, 0); + tokens = split.toArray(new String[split.size()]); + } else { + final List<String> split = tokenize(chars, 0, chars.length); + tokens = split.toArray(new String[split.size()]); + } + } + } + + /** + * Internal method that performs the tokenization. + * <p> + * Most users of this class do not need to call this method. This method + * will be called automatically by other (public) methods when required. + * <p> + * This method exists to allow subclasses to add code before or after the + * tokenization. For example, a subclass could alter the character array, + * offset or count to be parsed, or call the tokenizer multiple times on + * multiple strings. It is also possible to filter the results. + * <p> + * <code>StrTokenizer</code> will always pass a zero offset and a count + * equal to the length of the array to this method, however a subclass + * may pass other values, or even an entirely different array. 
+ * + * @param srcChars the character array being tokenized, may be null + * @param offset the start position within the character array, must be valid + * @param count the number of characters to tokenize, must be valid + * @return the modifiable list of String tokens, unmodifiable if null array or zero count + */ + protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { + if (srcChars == null || count == 0) { + return Collections.emptyList(); + } + final StrBuilder buf = new StrBuilder(); + final List<String> tokenList = new ArrayList<>(); + int pos = offset; + + // loop around the entire buffer; NOTE(review): count is compared as an end index here, which equals "offset + count" only when offset is 0 - confirm before passing a non-zero offset from a subclass + while (pos >= 0 && pos < count) { + // find next token; a negative result means the end of the input was reached + pos = readNextToken(srcChars, pos, count, buf, tokenList); + + // handle case where end of string is a delimiter + if (pos >= count) { + addToken(tokenList, ""); + } + } + return tokenList; + } + + /** + * Adds a token to a list, honouring the ignoreEmptyTokens and emptyTokenAsNull settings. + * + * @param list the list to add to + * @param tok the token to add, null or empty is treated as an empty token + */ + private void addToken(final List<String> list, String tok) { + if (tok == null || tok.length() == 0) { + if (isIgnoreEmptyTokens()) { + return; + } + if (isEmptyTokenAsNull()) { + tok = null; + } + } + list.add(tok); + } + + /** + * Reads character by character through the String to get the next token. 
+ * + * @param srcChars the character array being tokenized + * @param start the index of the first character of the field + * @param len the exclusive end index of the characters being tokenized + * @param workArea a temporary work area + * @param tokenList the list of parsed tokens + * @return the starting position of the next field (the character + * immediately after the delimiter), or -1 if end of string found + */ + private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) { + // skip all leading ignored or trimmed characters, unless they match the + // field delimiter or the quote character + while (start < len) { + final int removeLen = Math.max( + getIgnoredMatcher().isMatch(srcChars, start, start, len), + getTrimmerMatcher().isMatch(srcChars, start, start, len)); + if (removeLen == 0 || + getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 || + getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { + break; + } + start += removeLen; + } + + // handle reaching end: only skippable characters remained, so the last token is empty + if (start >= len) { + addToken(tokenList, ""); + return -1; + } + + // handle empty token (a delimiter directly at the field start) + final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); + if (delimLen > 0) { + addToken(tokenList, ""); + return start + delimLen; + } + + // handle found token, starting in quoting mode if it begins with a quote + final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); + if (quoteLen > 0) { + return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); + } + return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); + } + + /** + * Reads a possibly quoted string token. 
+ * + * @param srcChars the character array being tokenized + * @param start the index of the first character of the field, after any opening quote + * @param len the exclusive end index of the characters being tokenized + * @param workArea a temporary work area + * @param tokenList the list of parsed tokens + * @param quoteStart the start position of the matched quote, 0 if no quoting + * @param quoteLen the length of the matched quote, 0 if no quoting + * @return the starting position of the next field (the character + * immediately after the delimiter), or -1 if the end of the + * string was found + */ + private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, + final List<String> tokenList, final int quoteStart, final int quoteLen) { + // Loop until we've found the end of the quoted + // string or the end of the input + workArea.clear(); + int pos = start; + boolean quoting = quoteLen > 0; + int trimStart = 0; + + while (pos < len) { + // quoting mode can occur several times throughout a string + // we must switch between quoting and non-quoting until we + // encounter a non-quoted delimiter, or end of string + if (quoting) { + // In quoting mode + + // If we've found a quote character, see if it's + // followed by a second quote. If so, then we need + // to actually put the quote character into the token + // rather than end the token. 
+ if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { + if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { + // matched pair of quotes, thus an escaped quote + workArea.append(srcChars, pos, quoteLen); + pos += quoteLen * 2; + trimStart = workArea.size(); + continue; + } + + // end of quoting + quoting = false; + pos += quoteLen; + continue; + } + + // copy regular character from inside quotes + workArea.append(srcChars[pos++]); + trimStart = workArea.size(); + + } else { + // Not in quoting mode + + // check for delimiter, and thus end of token + final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); + if (delimLen > 0) { + // return condition when end of token found + addToken(tokenList, workArea.substring(0, trimStart)); + return pos + delimLen; + } + + // check for quote, and thus back into quoting mode + if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { + quoting = true; + pos += quoteLen; + continue; + } + + // check for ignored (outside quotes), and ignore + final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); + if (ignoredLen > 0) { + pos += ignoredLen; + continue; + } + + // check for trimmed character + // don't yet know if it's at the end, so copy to workArea + // trimStart is not advanced here, so trailing trimmed characters are cut off by substring(0, trimStart) + final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); + if (trimmedLen > 0) { + workArea.append(srcChars, pos, trimmedLen); + pos += trimmedLen; + continue; + } + + // copy regular character from outside quotes + workArea.append(srcChars[pos++]); + trimStart = workArea.size(); + } + } + + // return condition when end of string found; -1 signals that no further field follows + addToken(tokenList, workArea.substring(0, trimStart)); + return -1; + } + + /** + * Checks if the characters at the index specified match the quote + * already matched in readNextToken(). 
+ * + * @param srcChars the character array being tokenized + * @param pos the position to check for a quote + * @param len the exclusive end index of the characters being tokenized + * @param quoteStart the start position of the matched quote, 0 if no quoting + * @param quoteLen the length of the matched quote, 0 if no quoting + * @return true if a quote is matched (also true when quoteLen is 0, as nothing is compared) + */ + private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) { + for (int i = 0; i < quoteLen; i++) { + if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { + return false; + } + } + return true; + } + + // Delimiter + //----------------------------------------------------------------------- + /** + * Gets the field delimiter matcher. + * + * @return the delimiter matcher in use + */ + public StrMatcher getDelimiterMatcher() { + return this.delimMatcher; + } + + /** + * Sets the field delimiter matcher. + * <p> + * The delimiter is used to separate one token from another. + * + * @param delim the delimiter matcher to use, null sets the none matcher + * @return this, to enable chaining + */ + public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { + if (delim == null) { + this.delimMatcher = StrMatcher.noneMatcher(); + } else { + this.delimMatcher = delim; + } + return this; + } + + /** + * Sets the field delimiter character. + * + * @param delim the delimiter character to use + * @return this, to enable chaining + */ + public StrTokenizer setDelimiterChar(final char delim) { + return setDelimiterMatcher(StrMatcher.charMatcher(delim)); + } + + /** + * Sets the field delimiter string. + * + * @param delim the delimiter string to use + * @return this, to enable chaining + */ + public StrTokenizer setDelimiterString(final String delim) { + return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); + } + + // Quote + //----------------------------------------------------------------------- + /** + * Gets the quote matcher currently in use. 
+ * <p> + * The quote character is used to wrap data between the tokens. + * This enables delimiters to be entered as data. + * The default is not to match any quote (the CSV and TSV instances use the double quote). + * + * @return the quote matcher in use + */ + public StrMatcher getQuoteMatcher() { + return quoteMatcher; + } + + /** + * Sets the quote matcher to use. + * <p> + * The quote character is used to wrap data between the tokens. + * This enables delimiters to be entered as data. + * + * @param quote the quote matcher to use, null is ignored (the current matcher is kept) + * @return this, to enable chaining + */ + public StrTokenizer setQuoteMatcher(final StrMatcher quote) { + if (quote != null) { + this.quoteMatcher = quote; + } + return this; + } + + /** + * Sets the quote character to use. + * <p> + * The quote character is used to wrap data between the tokens. + * This enables delimiters to be entered as data. + * + * @param quote the quote character to use + * @return this, to enable chaining + */ + public StrTokenizer setQuoteChar(final char quote) { + return setQuoteMatcher(StrMatcher.charMatcher(quote)); + } + + // Ignored + //----------------------------------------------------------------------- + /** + * Gets the ignored character matcher. + * <p> + * These characters are ignored when parsing the String, unless they are + * within a quoted region. + * The default value is not to ignore anything. + * + * @return the ignored matcher in use + */ + public StrMatcher getIgnoredMatcher() { + return ignoredMatcher; + } + + /** + * Sets the matcher for characters to ignore. + * <p> + * These characters are ignored when parsing the String, unless they are + * within a quoted region. + * + * @param ignored the ignored matcher to use, null is ignored (the current matcher is kept) + * @return this, to enable chaining + */ + public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { + if (ignored != null) { + this.ignoredMatcher = ignored; + } + return this; + } + + /** + * Sets the character to ignore. 
+ * <p> + * This character is ignored when parsing the String, unless it is + * within a quoted region. + * + * @param ignored the ignored character to use + * @return this, to enable chaining + */ + public StrTokenizer setIgnoredChar(final char ignored) { + return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); + } + + // Trimmer + //----------------------------------------------------------------------- + /** + * Gets the trimmer character matcher. + * <p> + * These characters are trimmed off on each side of the delimiter + * until the token or quote is found. + * The default value is not to trim anything. + * + * @return the trimmer matcher in use + */ + public StrMatcher getTrimmerMatcher() { + return trimmerMatcher; + } + + /** + * Sets the matcher for characters to trim. + * <p> + * These characters are trimmed off on each side of the delimiter + * until the token or quote is found. + * + * @param trimmer the trimmer matcher to use, null is ignored (the current matcher is kept) + * @return this, to enable chaining + */ + public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { + if (trimmer != null) { + this.trimmerMatcher = trimmer; + } + return this; + } + + //----------------------------------------------------------------------- + /** + * Gets whether the tokenizer currently returns empty tokens as null. + * The default for this property is false; it only takes effect when empty tokens are not ignored. + * + * @return true if empty tokens are returned as null + */ + public boolean isEmptyTokenAsNull() { + return this.emptyAsNull; + } + + /** + * Sets whether the tokenizer should return empty tokens as null. + * The default for this property is false; it only takes effect when empty tokens are not ignored. + * + * @param emptyAsNull whether empty tokens are returned as null + * @return this, to enable chaining + */ + public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { + this.emptyAsNull = emptyAsNull; + return this; + } + + //----------------------------------------------------------------------- + /** + * Gets whether the tokenizer currently ignores empty tokens. 
+ * The default for this property is true. + * + * @return true if empty tokens are not returned + */ + public boolean isIgnoreEmptyTokens() { + return ignoreEmptyTokens; + } + + /** + * Sets whether the tokenizer should ignore and not return empty tokens. + * The default for this property is true. + * + * @param ignoreEmptyTokens whether empty tokens are not returned + * @return this, to enable chaining + */ + public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { + this.ignoreEmptyTokens = ignoreEmptyTokens; + return this; + } + + //----------------------------------------------------------------------- + /** + * Gets the String content that the tokenizer is parsing. + * + * @return the string content being parsed, null if no text has been set + */ + public String getContent() { + if (chars == null) { + return null; + } + return new String(chars); + } + + //----------------------------------------------------------------------- + /** + * Creates a new instance of this Tokenizer. The new instance is reset so + * that it will be at the start of the token list. + * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned. + * + * @return a new instance of this Tokenizer which has been reset, or null if cloning failed. + */ + @Override + public Object clone() { + try { + return cloneReset(); + } catch (final CloneNotSupportedException ex) { + return null; + } + } + + /** + * Creates a new instance of this Tokenizer. The new instance is reset so that + * it will be at the start of the token list. + * + * @return a new instance of this Tokenizer which has been reset. 
+ * @throws CloneNotSupportedException if there is a problem cloning + */ + Object cloneReset() throws CloneNotSupportedException { + // this method exists to enable 100% test coverage + final StrTokenizer cloned = (StrTokenizer) super.clone(); + if (cloned.chars != null) { + cloned.chars = cloned.chars.clone(); + } + cloned.reset(); + return cloned; + } + + //----------------------------------------------------------------------- + /** + * Gets the String content that the tokenizer is parsing. + * + * @return the string content being parsed + */ + @Override + public String toString() { + if (tokens == null) { + return "StrTokenizer[not tokenized yet]"; + } + return "StrTokenizer" + getTokenList(); + } + +}
/**
 * <p>Operations on Strings that contain words.</p>
 *
 * <p>This class tries to handle <code>null</code> input gracefully.
 * An exception will not be thrown for a <code>null</code> input.
 * Each method documents its behaviour in more detail.</p>
 */
public class WordUtils {

    /**
     * <p><code>WordUtils</code> instances should NOT be constructed in
     * standard programming. Instead, the class should be used as
     * <code>WordUtils.wrap("foo bar", 20);</code>.</p>
     *
     * <p>This constructor is public to permit tools that require a JavaBean
     * instance to operate.</p>
     */
    public WordUtils() {
        super();
    }

    // Wrapping
    //--------------------------------------------------------------------------
    /**
     * <p>Wraps a single line of text, identifying words by <code>' '</code>.</p>
     *
     * <p>New lines will be separated by the system property line separator.
     * Very long words, such as URLs will <i>not</i> be wrapped.</p>
     *
     * <p>Leading spaces on a new line are stripped.
     * Trailing spaces are not stripped.</p>
     *
     * <table border="1" summary="Wrap Results">
     *  <tr>
     *   <th>input</th>
     *   <th>wrapLength</th>
     *   <th>result</th>
     *  </tr>
     *  <tr>
     *   <td>null</td>
     *   <td>*</td>
     *   <td>null</td>
     *  </tr>
     *  <tr>
     *   <td>""</td>
     *   <td>*</td>
     *   <td>""</td>
     *  </tr>
     *  <tr>
     *   <td>"Here is one line of text that is going to be wrapped after 20 columns."</td>
     *   <td>20</td>
     *   <td>"Here is one line of\ntext that is going\nto be wrapped after\n20 columns."</td>
     *  </tr>
     *  <tr>
     *   <td>"Click here to jump to the commons website - http://commons.apache.org"</td>
     *   <td>20</td>
     *   <td>"Click here to jump\nto the commons\nwebsite -\nhttp://commons.apache.org"</td>
     *  </tr>
     *  <tr>
     *   <td>"Click here, http://commons.apache.org, to jump to the commons website"</td>
     *   <td>20</td>
     *   <td>"Click here,\nhttp://commons.apache.org,\nto jump to the\ncommons website"</td>
     *  </tr>
     * </table>
     *
     * (assuming that '\n' is the system's line separator)
     *
     * @param str  the String to be word wrapped, may be null
     * @param wrapLength  the column to wrap the words at, less than 1 is treated as 1
     * @return a line with newlines inserted, <code>null</code> if null input
     */
    public static String wrap(final String str, final int wrapLength) {
        return wrap(str, wrapLength, null, false);
    }

    /**
     * <p>Wraps a single line of text, identifying words by <code>' '</code>.</p>
     *
     * <p>Leading spaces on a new line are stripped.
     * Trailing spaces are not stripped.</p>
     *
     * <table border="1" summary="Wrap Results">
     *  <tr>
     *   <th>input</th>
     *   <th>wrapLength</th>
     *   <th>newLineString</th>
     *   <th>wrapLongWords</th>
     *   <th>result</th>
     *  </tr>
     *  <tr>
     *   <td>null</td>
     *   <td>*</td>
     *   <td>*</td>
     *   <td>true/false</td>
     *   <td>null</td>
     *  </tr>
     *  <tr>
     *   <td>""</td>
     *   <td>*</td>
     *   <td>*</td>
     *   <td>true/false</td>
     *   <td>""</td>
     *  </tr>
     *  <tr>
     *   <td>"Here is one line of text that is going to be wrapped after 20 columns."</td>
     *   <td>20</td>
     *   <td>"\n"</td>
     *   <td>true/false</td>
     *   <td>"Here is one line of\ntext that is going\nto be wrapped after\n20 columns."</td>
     *  </tr>
     *  <tr>
     *   <td>"Here is one line of text that is going to be wrapped after 20 columns."</td>
     *   <td>20</td>
     *   <td>"&lt;br /&gt;"</td>
     *   <td>true/false</td>
     *   <td>"Here is one line of&lt;br /&gt;text that is going&lt;br /&gt;to be wrapped after&lt;br /&gt;20 columns."</td>
     *  </tr>
     *  <tr>
     *   <td>"Here is one line of text that is going to be wrapped after 20 columns."</td>
     *   <td>20</td>
     *   <td>null</td>
     *   <td>true/false</td>
     *   <td>"Here is one line of" + systemNewLine + "text that is going" + systemNewLine + "to be wrapped after" + systemNewLine + "20 columns."</td>
     *  </tr>
     *  <tr>
     *   <td>"Click here to jump to the commons website - http://commons.apache.org"</td>
     *   <td>20</td>
     *   <td>"\n"</td>
     *   <td>false</td>
     *   <td>"Click here to jump\nto the commons\nwebsite -\nhttp://commons.apache.org"</td>
     *  </tr>
     *  <tr>
     *   <td>"Click here to jump to the commons website - http://commons.apache.org"</td>
     *   <td>20</td>
     *   <td>"\n"</td>
     *   <td>true</td>
     *   <td>"Click here to jump\nto the commons\nwebsite -\nhttp://commons.apach\ne.org"</td>
     *  </tr>
     * </table>
     *
     * @param str  the String to be word wrapped, may be null
     * @param wrapLength  the column to wrap the words at, less than 1 is treated as 1
     * @param newLineStr  the string to insert for a new line,
     *  <code>null</code> uses the system property line separator
     * @param wrapLongWords  true if long words (such as URLs) should be wrapped
     * @return a line with newlines inserted, <code>null</code> if null input
     */
    public static String wrap(final String str, final int wrapLength, final String newLineStr, final boolean wrapLongWords) {
        return wrap(str, wrapLength, newLineStr, wrapLongWords, " ");
    }

    /**
     * <p>Wraps a single line of text, identifying words by <code>wrapOn</code>.</p>
     *
     * <p>Leading spaces on a new line are stripped.
     * Trailing spaces are not stripped.</p>
     *
     * <table border="1" summary="Wrap Results">
     *  <tr>
     *   <th>input</th>
     *   <th>wrapLength</th>
     *   <th>newLineString</th>
     *   <th>wrapLongWords</th>
     *   <th>wrapOn</th>
     *   <th>result</th>
     *  </tr>
     *  <tr>
     *   <td>null</td>
     *   <td>*</td>
     *   <td>*</td>
     *   <td>true/false</td>
     *   <td>*</td>
     *   <td>null</td>
     *  </tr>
     *  <tr>
     *   <td>""</td>
     *   <td>*</td>
     *   <td>*</td>
     *   <td>true/false</td>
     *   <td>*</td>
     *   <td>""</td>
     *  </tr>
     *  <tr>
     *   <td>"Here is one line of text that is going to be wrapped after 20 columns."</td>
     *   <td>20</td>
     *   <td>"\n"</td>
     *   <td>true/false</td>
     *   <td>" "</td>
     *   <td>"Here is one line of\ntext that is going\nto be wrapped after\n20 columns."</td>
     *  </tr>
     *  <tr>
     *   <td>"Here is one line of text that is going to be wrapped after 20 columns."</td>
     *   <td>20</td>
     *   <td>"&lt;br /&gt;"</td>
     *   <td>true/false</td>
     *   <td>" "</td>
     *   <td>"Here is one line of&lt;br /&gt;text that is going&lt;br /&gt;to be wrapped after&lt;br /&gt;20 columns."</td>
     *  </tr>
     *  <tr>
     *   <td>"Here is one line of text that is going to be wrapped after 20 columns."</td>
     *   <td>20</td>
     *   <td>null</td>
     *   <td>true/false</td>
     *   <td>" "</td>
     *   <td>"Here is one line of" + systemNewLine + "text that is going" + systemNewLine + "to be wrapped after" + systemNewLine + "20 columns."</td>
     *  </tr>
     *  <tr>
     *   <td>"Click here to jump to the commons website - http://commons.apache.org"</td>
     *   <td>20</td>
     *   <td>"\n"</td>
     *   <td>false</td>
     *   <td>" "</td>
     *   <td>"Click here to jump\nto the commons\nwebsite -\nhttp://commons.apache.org"</td>
     *  </tr>
     *  <tr>
     *   <td>"Click here to jump to the commons website - http://commons.apache.org"</td>
     *   <td>20</td>
     *   <td>"\n"</td>
     *   <td>true</td>
     *   <td>" "</td>
     *   <td>"Click here to jump\nto the commons\nwebsite -\nhttp://commons.apach\ne.org"</td>
     *  </tr>
     *  <tr>
     *   <td>"flammable/inflammable"</td>
     *   <td>20</td>
     *   <td>"\n"</td>
     *   <td>true</td>
     *   <td>"/"</td>
     *   <td>"flammable\ninflammable"</td>
     *  </tr>
     * </table>
     *
     * <p>Limitations of this implementation: after a break the scan resumes
     * exactly one character past the start of the matched delimiter, so
     * <code>wrapOn</code> is expected to match a single character. A pattern
     * that matches the empty string at the start of the remaining text never
     * advances the scan position, so such patterns must not be used.</p>
     *
     * @param str  the String to be word wrapped, may be null
     * @param wrapLength  the column to wrap the words at, less than 1 is treated as 1
     * @param newLineStr  the string to insert for a new line,
     *  <code>null</code> uses the system property line separator
     * @param wrapLongWords  true if long words (such as URLs) should be wrapped
     * @param wrapOn  regular expression matching the characters on which a line may be broken,
     *  if null or a blank string is provided a space character will be used
     * @return a line with newlines inserted, <code>null</code> if null input
     * @throws java.util.regex.PatternSyntaxException if <code>wrapOn</code> is not a valid regular expression
     */
    public static String wrap(final String str, int wrapLength, String newLineStr, final boolean wrapLongWords, String wrapOn) {
        if (str == null) {
            return null;
        }
        if (newLineStr == null) {
            newLineStr = System.getProperty("line.separator");
        }
        if (wrapLength < 1) {
            wrapLength = 1;
        }
        if (wrapOn == null || wrapOn.trim().isEmpty()) {
            wrapOn = " ";
        }
        final Pattern patternToWrapOn = Pattern.compile(wrapOn);
        final int inputLineLength = str.length();
        int offset = 0;
        final StringBuilder wrappedLine = new StringBuilder(inputLineLength + 32);

        while (offset < inputLineLength) {
            int spaceToWrapAt = -1;
            // The look-ahead window is one character wider than wrapLength so that a
            // delimiter sitting exactly at the wrap column is seen. Computed in long
            // arithmetic because offset + wrapLength + 1 overflows int for huge wrapLength.
            final int windowEnd = (int) Math.min((long) offset + wrapLength + 1L, inputLineLength);
            Matcher matcher = patternToWrapOn.matcher(str.substring(offset, windowEnd));
            if (matcher.find()) {
                if (matcher.start() == 0) {
                    // delimiter at the very start of the line: strip it and rescan
                    offset += matcher.end();
                    continue;
                }
                // matcher positions are relative to the window; convert to an index into str
                spaceToWrapAt = matcher.start() + offset;
            }

            // only last line without leading spaces is left
            if (inputLineLength - offset <= wrapLength) {
                break;
            }

            // keep the LAST delimiter inside the window so the line is as long as possible
            while (matcher.find()) {
                spaceToWrapAt = matcher.start() + offset;
            }

            if (spaceToWrapAt >= offset) {
                // normal case
                wrappedLine.append(str, offset, spaceToWrapAt);
                wrappedLine.append(newLineStr);
                offset = spaceToWrapAt + 1;
            } else if (wrapLongWords) {
                // really long word or URL: wrap it one line at a time
                wrappedLine.append(str, offset, wrapLength + offset);
                wrappedLine.append(newLineStr);
                offset += wrapLength;
            } else {
                // do not wrap really long word, just extend beyond limit
                matcher = patternToWrapOn.matcher(str.substring(offset + wrapLength));
                if (matcher.find()) {
                    spaceToWrapAt = matcher.start() + offset + wrapLength;
                }

                if (spaceToWrapAt >= 0) {
                    wrappedLine.append(str, offset, spaceToWrapAt);
                    wrappedLine.append(newLineStr);
                    offset = spaceToWrapAt + 1;
                } else {
                    wrappedLine.append(str, offset, inputLineLength);
                    offset = inputLineLength;
                }
            }
        }

        // Whatever is left in line is short enough to just pass through
        wrappedLine.append(str, offset, inputLineLength);

        return wrappedLine.toString();
    }

    // Capitalizing
    //-----------------------------------------------------------------------
    /**
     * <p>Capitalizes all the whitespace separated words in a String.
     * Only the first character of each word is changed. To convert the
     * rest of each word to lowercase at the same time,
     * use {@link #capitalizeFully(String)}.</p>
     *
     * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
     * A <code>null</code> input String returns <code>null</code>.
     * Capitalization uses the Unicode title case, normally equivalent to
     * upper case.</p>
     *
     * <pre>
     * WordUtils.capitalize(null)        = null
     * WordUtils.capitalize("")          = ""
     * WordUtils.capitalize("i am FINE") = "I Am FINE"
     * </pre>
     *
     * @param str  the String to capitalize, may be null
     * @return capitalized String, <code>null</code> if null String input
     * @see #uncapitalize(String)
     * @see #capitalizeFully(String)
     */
    public static String capitalize(final String str) {
        return capitalize(str, null);
    }

    /**
     * <p>Capitalizes all the delimiter separated words in a String.
     * Only the first character of each word is changed. To convert the
     * rest of each word to lowercase at the same time,
     * use {@link #capitalizeFully(String, char[])}.</p>
     *
     * <p>The delimiters represent a set of characters understood to separate words.
     * The first string character and the first non-delimiter character after a
     * delimiter will be capitalized. </p>
     *
     * <p>A <code>null</code> input String returns <code>null</code>.
     * Capitalization uses the Unicode title case, normally equivalent to
     * upper case.</p>
     *
     * <pre>
     * WordUtils.capitalize(null, *)            = null
     * WordUtils.capitalize("", *)              = ""
     * WordUtils.capitalize(*, new char[0])     = *
     * WordUtils.capitalize("i am fine", null)  = "I Am Fine"
     * WordUtils.capitalize("i aM.fine", {'.'}) = "I aM.Fine"
     * </pre>
     *
     * @param str  the String to capitalize, may be null
     * @param delimiters  set of characters to determine capitalization, null means whitespace
     * @return capitalized String, <code>null</code> if null String input
     * @see #uncapitalize(String)
     * @see #capitalizeFully(String)
     * @since 2.1
     */
    public static String capitalize(final String str, final char... delimiters) {
        final int delimLen = delimiters == null ? -1 : delimiters.length;
        if (str == null || str.length() == 0 || delimLen == 0) {
            return str;
        }
        final char[] buffer = str.toCharArray();
        boolean capitalizeNext = true;
        for (int i = 0; i < buffer.length; i++) {
            final char ch = buffer[i];
            if (isDelimiter(ch, delimiters)) {
                capitalizeNext = true;
            } else if (capitalizeNext) {
                buffer[i] = Character.toTitleCase(ch);
                capitalizeNext = false;
            }
        }
        return new String(buffer);
    }

    //-----------------------------------------------------------------------
    /**
     * <p>Converts all the whitespace separated words in a String into capitalized words,
     * that is each word is made up of a titlecase character and then a series of
     * lowercase characters.  </p>
     *
     * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
     * A <code>null</code> input String returns <code>null</code>.
     * Capitalization uses the Unicode title case, normally equivalent to
     * upper case.</p>
     *
     * <pre>
     * WordUtils.capitalizeFully(null)        = null
     * WordUtils.capitalizeFully("")          = ""
     * WordUtils.capitalizeFully("i am FINE") = "I Am Fine"
     * </pre>
     *
     * @param str  the String to capitalize, may be null
     * @return capitalized String, <code>null</code> if null String input
     */
    public static String capitalizeFully(final String str) {
        return capitalizeFully(str, null);
    }

    /**
     * <p>Converts all the delimiter separated words in a String into capitalized words,
     * that is each word is made up of a titlecase character and then a series of
     * lowercase characters. </p>
     *
     * <p>The delimiters represent a set of characters understood to separate words.
     * The first string character and the first non-delimiter character after a
     * delimiter will be capitalized. </p>
     *
     * <p>A <code>null</code> input String returns <code>null</code>.
     * Capitalization uses the Unicode title case, normally equivalent to
     * upper case. Lower-casing is done with {@link String#toLowerCase()}.</p>
     *
     * <pre>
     * WordUtils.capitalizeFully(null, *)            = null
     * WordUtils.capitalizeFully("", *)              = ""
     * WordUtils.capitalizeFully(*, new char[0])     = *
     * WordUtils.capitalizeFully("i aM fine", null)  = "I Am Fine"
     * WordUtils.capitalizeFully("i aM.fine", {'.'}) = "I am.Fine"
     * </pre>
     *
     * @param str  the String to capitalize, may be null
     * @param delimiters  set of characters to determine capitalization, null means whitespace
     * @return capitalized String, <code>null</code> if null String input
     * @since 2.1
     */
    public static String capitalizeFully(String str, final char... delimiters) {
        final int delimLen = delimiters == null ? -1 : delimiters.length;
        if (str == null || str.length() == 0 || delimLen == 0) {
            return str;
        }
        str = str.toLowerCase();
        return capitalize(str, delimiters);
    }

    //-----------------------------------------------------------------------
    /**
     * <p>Uncapitalizes all the whitespace separated words in a String.
     * Only the first character of each word is changed.</p>
     *
     * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
     * A <code>null</code> input String returns <code>null</code>.</p>
     *
     * <pre>
     * WordUtils.uncapitalize(null)        = null
     * WordUtils.uncapitalize("")          = ""
     * WordUtils.uncapitalize("I Am FINE") = "i am fINE"
     * </pre>
     *
     * @param str  the String to uncapitalize, may be null
     * @return uncapitalized String, <code>null</code> if null String input
     * @see #capitalize(String)
     */
    public static String uncapitalize(final String str) {
        return uncapitalize(str, null);
    }

    /**
     * <p>Uncapitalizes all the delimiter separated words in a String.
     * Only the first character of each word is changed.</p>
     *
     * <p>The delimiters represent a set of characters understood to separate words.
     * The first string character and the first non-delimiter character after a
     * delimiter will be uncapitalized. </p>
     *
     * <p>If the delimiters array is null, whitespace as defined by
     * {@link Character#isWhitespace(char)} is used.
     * A <code>null</code> input String returns <code>null</code>.</p>
     *
     * <pre>
     * WordUtils.uncapitalize(null, *)            = null
     * WordUtils.uncapitalize("", *)              = ""
     * WordUtils.uncapitalize(*, new char[0])     = *
     * WordUtils.uncapitalize("I Am FINE", null)  = "i am fINE"
     * WordUtils.uncapitalize("I AM.FINE", {'.'}) = "i AM.fINE"
     * </pre>
     *
     * @param str  the String to uncapitalize, may be null
     * @param delimiters  set of characters to determine uncapitalization, null means whitespace
     * @return uncapitalized String, <code>null</code> if null String input
     * @see #capitalize(String)
     * @since 2.1
     */
    public static String uncapitalize(final String str, final char... delimiters) {
        final int delimLen = delimiters == null ? -1 : delimiters.length;
        if (str == null || str.length() == 0 || delimLen == 0) {
            return str;
        }
        final char[] buffer = str.toCharArray();
        boolean uncapitalizeNext = true;
        for (int i = 0; i < buffer.length; i++) {
            final char ch = buffer[i];
            if (isDelimiter(ch, delimiters)) {
                uncapitalizeNext = true;
            } else if (uncapitalizeNext) {
                buffer[i] = Character.toLowerCase(ch);
                uncapitalizeNext = false;
            }
        }
        return new String(buffer);
    }

    //-----------------------------------------------------------------------
    /**
     * <p>Swaps the case of a String using a word based algorithm.</p>
     *
     * <ul>
     *  <li>Upper case character converts to Lower case</li>
     *  <li>Title case character converts to Lower case</li>
     *  <li>Lower case character after Whitespace or at start converts to Title case</li>
     *  <li>Other Lower case character converts to Upper case</li>
     * </ul>
     *
     * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
     * A <code>null</code> input String returns <code>null</code>.</p>
     *
     * <pre>
     * WordUtils.swapCase(null)                 = null
     * WordUtils.swapCase("")                   = ""
     * WordUtils.swapCase("The dog has a BONE") = "tHE DOG HAS A bone"
     * </pre>
     *
     * @param str  the String to swap case, may be null
     * @return the changed String, <code>null</code> if null String input
     */
    public static String swapCase(final String str) {
        if (str == null || str.length() == 0) {
            return str;
        }
        final char[] buffer = str.toCharArray();

        // true while the previous character was whitespace (or at the start of the String)
        boolean whitespace = true;

        for (int i = 0; i < buffer.length; i++) {
            final char ch = buffer[i];
            if (Character.isUpperCase(ch)) {
                buffer[i] = Character.toLowerCase(ch);
                whitespace = false;
            } else if (Character.isTitleCase(ch)) {
                buffer[i] = Character.toLowerCase(ch);
                whitespace = false;
            } else if (Character.isLowerCase(ch)) {
                if (whitespace) {
                    buffer[i] = Character.toTitleCase(ch);
                    whitespace = false;
                } else {
                    buffer[i] = Character.toUpperCase(ch);
                }
            } else {
                whitespace = Character.isWhitespace(ch);
            }
        }
        return new String(buffer);
    }

    //-----------------------------------------------------------------------
    /**
     * <p>Extracts the initial characters from each word in the String.</p>
     *
     * <p>All first characters after whitespace are returned as a new string.
     * Their case is not changed.</p>
     *
     * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
     * A <code>null</code> input String returns <code>null</code>.</p>
     *
     * <pre>
     * WordUtils.initials(null)             = null
     * WordUtils.initials("")               = ""
     * WordUtils.initials("Ben John Lee")   = "BJL"
     * WordUtils.initials("Ben J.Lee")      = "BJ"
     * </pre>
     *
     * @param str  the String to get initials from, may be null
     * @return String of initial letters, <code>null</code> if null String input
     * @see #initials(String,char[])
     * @since 2.2
     */
    public static String initials(final String str) {
        return initials(str, null);
    }

    /**
     * <p>Extracts the initial characters from each word in the String.</p>
     *
     * <p>All first characters after the defined delimiters are returned as a new string.
     * Their case is not changed.</p>
     *
     * <p>If the delimiters array is null, then Whitespace is used.
     * Whitespace is defined by {@link Character#isWhitespace(char)}.
     * A <code>null</code> input String returns <code>null</code>.
     * An empty delimiter array returns an empty String.</p>
     *
     * <pre>
     * WordUtils.initials(null, *)                = null
     * WordUtils.initials("", *)                  = ""
     * WordUtils.initials("Ben John Lee", null)   = "BJL"
     * WordUtils.initials("Ben J.Lee", null)      = "BJ"
     * WordUtils.initials("Ben J.Lee", [' ','.']) = "BJL"
     * WordUtils.initials(*, new char[0])         = ""
     * </pre>
     *
     * @param str  the String to get initials from, may be null
     * @param delimiters  set of characters to determine words, null means whitespace
     * @return String of initial characters, <code>null</code> if null String input
     * @see #initials(String)
     * @since 2.2
     */
    public static String initials(final String str, final char... delimiters) {
        if (str == null || str.length() == 0) {
            return str;
        }
        if (delimiters != null && delimiters.length == 0) {
            return "";
        }
        final int strLen = str.length();
        // every initial except the first needs at least one delimiter before it,
        // so at most strLen / 2 + 1 initials can occur
        final char[] buf = new char[strLen / 2 + 1];
        int count = 0;
        boolean lastWasGap = true;
        for (int i = 0; i < strLen; i++) {
            final char ch = str.charAt(i);

            if (isDelimiter(ch, delimiters)) {
                lastWasGap = true;
            } else if (lastWasGap) {
                buf[count++] = ch;
                lastWasGap = false;
            }
            // any other character is inside a word and is ignored
        }
        return new String(buf, 0, count);
    }

    //-----------------------------------------------------------------------
    /**
     * <p>Checks if the String contains all words in the given array.</p>
     *
     * <p>
     * A {@code null} String will return {@code false}. A {@code null}, zero
     * length search array or if one element of array is null, empty or blank
     * will return {@code false}.
     * </p>
     *
     * <p>Each search word is matched literally (regular expression
     * metacharacters in it have no special meaning) and must be delimited by
     * word boundaries (<code>\b</code>) in the checked text.</p>
     *
     * <pre>
     * WordUtils.containsAllWords(null, *)            = false
     * WordUtils.containsAllWords("", *)              = false
     * WordUtils.containsAllWords(*, null)            = false
     * WordUtils.containsAllWords(*, [])              = false
     * WordUtils.containsAllWords("abcd", "ab", "cd") = false
     * WordUtils.containsAllWords("abc def", "def", "abc") = true
     * WordUtils.containsAllWords("abc def", "a.c")   = false
     * </pre>
     *
     *
     * @param word The CharSequence to check, may be null
     * @param words The array of String words to search for, may be null
     * @return {@code true} if all search words are found, {@code false} otherwise
     * @since 3.5
     */
    public static boolean containsAllWords(final CharSequence word, final CharSequence... words) {
        if (word == null || word.length() == 0 || words == null || words.length == 0) {
            return false;
        }
        for (final CharSequence w : words) {
            if (w == null || w.length() == 0 || String.valueOf(w).trim().length() == 0) {
                return false;
            }
            // quote the word so that characters such as '.', '(' or '+' are matched literally
            final Pattern p = Pattern.compile(".*\\b" + Pattern.quote(w.toString()) + "\\b.*");
            if (!p.matcher(word).matches()) {
                return false;
            }
        }
        return true;
    }

    //-----------------------------------------------------------------------
    /**
     * Is the character a delimiter.
     *
     * @param ch  the character to check
     * @param delimiters  the delimiters, null means {@link Character#isWhitespace(char)} is used
     * @return true if it is a delimiter
     */
    private static boolean isDelimiter(final char ch, final char[] delimiters) {
        if (delimiters == null) {
            return Character.isWhitespace(ch);
        }
        for (final char delimiter : delimiters) {
            if (ch == delimiter) {
                return true;
            }
        }
        return false;
    }

}
- * - * @since 3.0 */ public class AggregateTranslator extends CharSequenceTranslator { http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/CharSequenceTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/CharSequenceTranslator.java b/src/main/java/org/apache/commons/text/translate/CharSequenceTranslator.java index 16ec02f..0fed939 100644 --- a/src/main/java/org/apache/commons/text/translate/CharSequenceTranslator.java +++ b/src/main/java/org/apache/commons/text/translate/CharSequenceTranslator.java @@ -25,8 +25,6 @@ import java.util.Locale; * An API for translating text. * Its core use is to escape and unescape text. Because escaping and unescaping * is completely contextual, the API does not present two separate signatures. - * - * @since 3.0 */ public abstract class CharSequenceTranslator { @@ -114,8 +112,8 @@ public abstract class CharSequenceTranslator { * @param translators CharSequenceTranslator array of translators to merge with this one * @return CharSequenceTranslator merging this translator with the others */ - public final org.apache.commons.text.translate.CharSequenceTranslator with(final org.apache.commons.text.translate.CharSequenceTranslator... translators) { - final org.apache.commons.text.translate.CharSequenceTranslator[] newArray = new org.apache.commons.text.translate.CharSequenceTranslator[translators.length + 1]; + public final CharSequenceTranslator with(final CharSequenceTranslator... 
translators) { + final CharSequenceTranslator[] newArray = new CharSequenceTranslator[translators.length + 1]; newArray[0] = this; System.arraycopy(translators, 0, newArray, 1, translators.length); return new AggregateTranslator(newArray); http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/CodePointTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/CodePointTranslator.java b/src/main/java/org/apache/commons/text/translate/CodePointTranslator.java index cac3d8f..c63165c 100644 --- a/src/main/java/org/apache/commons/text/translate/CodePointTranslator.java +++ b/src/main/java/org/apache/commons/text/translate/CodePointTranslator.java @@ -22,8 +22,6 @@ import java.io.Writer; /** * Helper subclass to CharSequenceTranslator to allow for translations that * will replace up to one character at a time. - * - * @since 3.0 */ public abstract class CodePointTranslator extends CharSequenceTranslator { http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/EntityArrays.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/EntityArrays.java b/src/main/java/org/apache/commons/text/translate/EntityArrays.java index 5c7c4e3..99626e6 100644 --- a/src/main/java/org/apache/commons/text/translate/EntityArrays.java +++ b/src/main/java/org/apache/commons/text/translate/EntityArrays.java @@ -20,8 +20,6 @@ package org.apache.commons.text.translate; * Class holding various entity data for HTML and XML - generally for use with * the LookupTranslator. * All arrays are of length [*][2]. 
- * - * @since 3.0 */ public class EntityArrays { http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/JavaUnicodeEscaper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/JavaUnicodeEscaper.java b/src/main/java/org/apache/commons/text/translate/JavaUnicodeEscaper.java index a9a186e..8c5b2b6 100644 --- a/src/main/java/org/apache/commons/text/translate/JavaUnicodeEscaper.java +++ b/src/main/java/org/apache/commons/text/translate/JavaUnicodeEscaper.java @@ -18,8 +18,6 @@ package org.apache.commons.text.translate; /** * Translates codepoints to their Unicode escaped value suitable for Java source. - * - * @since 3.2 */ public class JavaUnicodeEscaper extends UnicodeEscaper { @@ -32,7 +30,7 @@ public class JavaUnicodeEscaper extends UnicodeEscaper { * above which to escape * @return the newly created {@code UnicodeEscaper} instance */ - public static org.apache.commons.text.translate.JavaUnicodeEscaper above(final int codepoint) { + public static JavaUnicodeEscaper above(final int codepoint) { return outsideOf(0, codepoint); } @@ -45,7 +43,7 @@ public class JavaUnicodeEscaper extends UnicodeEscaper { * below which to escape * @return the newly created {@code UnicodeEscaper} instance */ - public static org.apache.commons.text.translate.JavaUnicodeEscaper below(final int codepoint) { + public static JavaUnicodeEscaper below(final int codepoint) { return outsideOf(codepoint, Integer.MAX_VALUE); } @@ -60,8 +58,8 @@ public class JavaUnicodeEscaper extends UnicodeEscaper { * below which to escape * @return the newly created {@code UnicodeEscaper} instance */ - public static org.apache.commons.text.translate.JavaUnicodeEscaper between(final int codepointLow, final int codepointHigh) { - return new org.apache.commons.text.translate.JavaUnicodeEscaper(codepointLow, codepointHigh, true); + public static JavaUnicodeEscaper 
between(final int codepointLow, final int codepointHigh) { + return new JavaUnicodeEscaper(codepointLow, codepointHigh, true); } /** @@ -75,8 +73,8 @@ public class JavaUnicodeEscaper extends UnicodeEscaper { * above which to escape * @return the newly created {@code UnicodeEscaper} instance */ - public static org.apache.commons.text.translate.JavaUnicodeEscaper outsideOf(final int codepointLow, final int codepointHigh) { - return new org.apache.commons.text.translate.JavaUnicodeEscaper(codepointLow, codepointHigh, false); + public static JavaUnicodeEscaper outsideOf(final int codepointLow, final int codepointHigh) { + return new JavaUnicodeEscaper(codepointLow, codepointHigh, false); } /** http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/LookupTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/LookupTranslator.java b/src/main/java/org/apache/commons/text/translate/LookupTranslator.java index 614c86e..f73f312 100644 --- a/src/main/java/org/apache/commons/text/translate/LookupTranslator.java +++ b/src/main/java/org/apache/commons/text/translate/LookupTranslator.java @@ -23,8 +23,6 @@ import java.util.HashSet; /** * Translates a value using a lookup table. 
- * - * @since 3.0 */ public class LookupTranslator extends CharSequenceTranslator { http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/NumericEntityEscaper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/NumericEntityEscaper.java b/src/main/java/org/apache/commons/text/translate/NumericEntityEscaper.java index a3bf24b..3e4bbd0 100644 --- a/src/main/java/org/apache/commons/text/translate/NumericEntityEscaper.java +++ b/src/main/java/org/apache/commons/text/translate/NumericEntityEscaper.java @@ -21,8 +21,6 @@ import java.io.Writer; /** * Translates codepoints to their XML numeric entity escaped value. - * - * @since 3.0 */ public class NumericEntityEscaper extends CodePointTranslator { @@ -59,7 +57,7 @@ public class NumericEntityEscaper extends CodePointTranslator { * @param codepoint below which to escape * @return the newly created {@code NumericEntityEscaper} instance */ - public static org.apache.commons.text.translate.NumericEntityEscaper below(final int codepoint) { + public static NumericEntityEscaper below(final int codepoint) { return outsideOf(codepoint, Integer.MAX_VALUE); } @@ -69,7 +67,7 @@ public class NumericEntityEscaper extends CodePointTranslator { * @param codepoint above which to escape * @return the newly created {@code NumericEntityEscaper} instance */ - public static org.apache.commons.text.translate.NumericEntityEscaper above(final int codepoint) { + public static NumericEntityEscaper above(final int codepoint) { return outsideOf(0, codepoint); } @@ -80,8 +78,8 @@ public class NumericEntityEscaper extends CodePointTranslator { * @param codepointHigh below which to escape * @return the newly created {@code NumericEntityEscaper} instance */ - public static org.apache.commons.text.translate.NumericEntityEscaper between(final int codepointLow, final int codepointHigh) { - return new 
org.apache.commons.text.translate.NumericEntityEscaper(codepointLow, codepointHigh, true); + public static NumericEntityEscaper between(final int codepointLow, final int codepointHigh) { + return new NumericEntityEscaper(codepointLow, codepointHigh, true); } /** @@ -91,8 +89,8 @@ public class NumericEntityEscaper extends CodePointTranslator { * @param codepointHigh above which to escape * @return the newly created {@code NumericEntityEscaper} instance */ - public static org.apache.commons.text.translate.NumericEntityEscaper outsideOf(final int codepointLow, final int codepointHigh) { - return new org.apache.commons.text.translate.NumericEntityEscaper(codepointLow, codepointHigh, false); + public static NumericEntityEscaper outsideOf(final int codepointLow, final int codepointHigh) { + return new NumericEntityEscaper(codepointLow, codepointHigh, false); } /** http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/NumericEntityUnescaper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/NumericEntityUnescaper.java b/src/main/java/org/apache/commons/text/translate/NumericEntityUnescaper.java index cd4605d..6e2016e 100644 --- a/src/main/java/org/apache/commons/text/translate/NumericEntityUnescaper.java +++ b/src/main/java/org/apache/commons/text/translate/NumericEntityUnescaper.java @@ -26,8 +26,6 @@ import java.util.EnumSet; * the specific codepoint. * * Note that the semi-colon is optional. 
- * - * @since 3.0 */ public class NumericEntityUnescaper extends CharSequenceTranslator { http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/OctalUnescaper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/OctalUnescaper.java b/src/main/java/org/apache/commons/text/translate/OctalUnescaper.java index 5801348..f9c0c14 100644 --- a/src/main/java/org/apache/commons/text/translate/OctalUnescaper.java +++ b/src/main/java/org/apache/commons/text/translate/OctalUnescaper.java @@ -26,8 +26,6 @@ import java.io.Writer; * * Note that this currently only supports the viable range of octal for Java; namely * 1 to 377. This is because parsing Java is the main use case. - * - * @since 3.0 */ public class OctalUnescaper extends CharSequenceTranslator { http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/UnicodeEscaper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/UnicodeEscaper.java b/src/main/java/org/apache/commons/text/translate/UnicodeEscaper.java index 398a0e9..0c90f6d 100644 --- a/src/main/java/org/apache/commons/text/translate/UnicodeEscaper.java +++ b/src/main/java/org/apache/commons/text/translate/UnicodeEscaper.java @@ -21,8 +21,6 @@ import java.io.Writer; /** * Translates codepoints to their Unicode escaped value. 
- * - * @since 3.0 */ public class UnicodeEscaper extends CodePointTranslator { @@ -59,7 +57,7 @@ public class UnicodeEscaper extends CodePointTranslator { * @param codepoint below which to escape * @return the newly created {@code UnicodeEscaper} instance */ - public static org.apache.commons.text.translate.UnicodeEscaper below(final int codepoint) { + public static UnicodeEscaper below(final int codepoint) { return outsideOf(codepoint, Integer.MAX_VALUE); } @@ -69,7 +67,7 @@ public class UnicodeEscaper extends CodePointTranslator { * @param codepoint above which to escape * @return the newly created {@code UnicodeEscaper} instance */ - public static org.apache.commons.text.translate.UnicodeEscaper above(final int codepoint) { + public static UnicodeEscaper above(final int codepoint) { return outsideOf(0, codepoint); } @@ -80,8 +78,8 @@ public class UnicodeEscaper extends CodePointTranslator { * @param codepointHigh above which to escape * @return the newly created {@code UnicodeEscaper} instance */ - public static org.apache.commons.text.translate.UnicodeEscaper outsideOf(final int codepointLow, final int codepointHigh) { - return new org.apache.commons.text.translate.UnicodeEscaper(codepointLow, codepointHigh, false); + public static UnicodeEscaper outsideOf(final int codepointLow, final int codepointHigh) { + return new UnicodeEscaper(codepointLow, codepointHigh, false); } /** @@ -91,8 +89,8 @@ public class UnicodeEscaper extends CodePointTranslator { * @param codepointHigh below which to escape * @return the newly created {@code UnicodeEscaper} instance */ - public static org.apache.commons.text.translate.UnicodeEscaper between(final int codepointLow, final int codepointHigh) { - return new org.apache.commons.text.translate.UnicodeEscaper(codepointLow, codepointHigh, true); + public static UnicodeEscaper between(final int codepointLow, final int codepointHigh) { + return new UnicodeEscaper(codepointLow, codepointHigh, true); } /** 
http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/UnicodeUnescaper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/UnicodeUnescaper.java b/src/main/java/org/apache/commons/text/translate/UnicodeUnescaper.java index b92f421..e8fda68 100644 --- a/src/main/java/org/apache/commons/text/translate/UnicodeUnescaper.java +++ b/src/main/java/org/apache/commons/text/translate/UnicodeUnescaper.java @@ -23,8 +23,6 @@ import java.io.Writer; * Translates escaped Unicode values of the form \\u+\d\d\d\d back to * Unicode. It supports multiple 'u' characters and will work with or * without the +. - * - * @since 3.0 */ public class UnicodeUnescaper extends CharSequenceTranslator { http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/main/java/org/apache/commons/text/translate/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/package-info.java b/src/main/java/org/apache/commons/text/translate/package-info.java index 722d7c9..3ef4d48 100644 --- a/src/main/java/org/apache/commons/text/translate/package-info.java +++ b/src/main/java/org/apache/commons/text/translate/package-info.java @@ -18,7 +18,5 @@ * <p> An API for creating text translation routines from a set of smaller building blocks. 
Initially created to make it * possible for the user to customize the rules in the StringEscapeUtils class.</p> * <p>These classes are immutable, and therefore thread-safe.</p> - * - * @since 3.0 */ package org.apache.commons.text.translate; http://git-wip-us.apache.org/repos/asf/commons-text/blob/6f24aa45/src/test/java/org/apache/commons/text/CompositeFormatTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/CompositeFormatTest.java b/src/test/java/org/apache/commons/text/CompositeFormatTest.java new file mode 100644 index 0000000..e9b8064 --- /dev/null +++ b/src/test/java/org/apache/commons/text/CompositeFormatTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.text; + +import org.junit.Test; +import static org.junit.Assert.*; +import java.text.FieldPosition; +import java.text.Format; +import java.text.ParsePosition; +import java.text.SimpleDateFormat; +import java.util.Locale; + +/** + * Unit tests for {@link org.apache.commons.text.CompositeFormat}. + */ +public class CompositeFormatTest { + + /** + * Ensures that the parse/format separation is correctly maintained. 
+ */ + @Test + public void testCompositeFormat() { + + final Format parser = new Format() { + private static final long serialVersionUID = 1L; + + @Override + public StringBuffer format(final Object obj, final StringBuffer toAppendTo, final FieldPosition pos) { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public Object parseObject(final String source, final ParsePosition pos) { + return null; // do nothing + } + }; + + final Format formatter = new Format() { + private static final long serialVersionUID = 1L; + + @Override + public StringBuffer format(final Object obj, final StringBuffer toAppendTo, final FieldPosition pos) { + return null; // do nothing + } + + @Override + public Object parseObject(final String source, final ParsePosition pos) { + throw new UnsupportedOperationException("Not implemented"); + } + }; + + final CompositeFormat composite = new CompositeFormat(parser, formatter); + + composite.parseObject("", null); + composite.format(new Object(), new StringBuffer(), null); + assertEquals( "Parser get method incorrectly implemented", parser, composite.getParser() ); + assertEquals( "Formatter get method incorrectly implemented", formatter, composite.getFormatter() ); + } + + @Test + public void testUsage() throws Exception { + final Format f1 = new SimpleDateFormat("MMddyyyy", Locale.ENGLISH); + final Format f2 = new SimpleDateFormat("MMMM d, yyyy", Locale.ENGLISH); + final CompositeFormat c = new CompositeFormat(f1, f2); + final String testString = "January 3, 2005"; + assertEquals(testString, c.format(c.parseObject("01032005"))); + assertEquals(testString, c.reformat("01032005")); + } + +}