This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-csv.git
commit d8d080453e30ee732bbdd1448d6c2e307d0746f4 Author: Gary Gregory <garydgreg...@gmail.com> AuthorDate: Mon Jul 5 12:06:07 2021 -0400 Add support for String delimiters #76. Bump PMD core from 6.29.0 to 6.36.0. Fix rule set. --- pom.xml | 43 ++-- src/changes/changes.xml | 2 + .../java/org/apache/commons/csv/CSVFormat.java | 266 ++++++++++++++------- .../java/org/apache/commons/csv/Constants.java | 2 +- .../apache/commons/csv/ExtendedBufferedReader.java | 21 +- src/main/java/org/apache/commons/csv/Lexer.java | 108 +++++++-- src/site/resources/pmd/pmd-ruleset.xml | 9 +- .../java/org/apache/commons/csv/CSVParserTest.java | 78 ++++++ .../org/apache/commons/csv/CSVPrinterTest.java | 55 ++++- .../apache/commons/csv/issues/JiraCsv206Test.java | 69 ++++++ 10 files changed, 505 insertions(+), 148 deletions(-) diff --git a/pom.xml b/pom.xml index c83aa0c..8e9c455 100644 --- a/pom.xml +++ b/pom.xml @@ -172,6 +172,7 @@ <checkstyle.resourceExcludes>LICENSE.txt, NOTICE.txt, **/maven-archiver/pom.properties</checkstyle.resourceExcludes> <commons.pmd.version>3.14.0</commons.pmd.version> + <commons.pmd-impl.version>6.36.0</commons.pmd-impl.version> <commons.jacoco.version>0.8.7</commons.jacoco.version> <commons.spotbugs.version>4.2.3</commons.spotbugs.version> <commons.japicmp.version>0.15.3</commons.japicmp.version> @@ -203,6 +204,30 @@ </dependency> </dependencies> </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-pmd-plugin</artifactId> + <version>${commons.pmd.version}</version> + <dependencies> + <dependency> + <groupId>net.sourceforge.pmd</groupId> + <artifactId>pmd-core</artifactId> + <version>${commons.pmd-impl.version}</version> + </dependency> + <dependency> + <groupId>net.sourceforge.pmd</groupId> + <artifactId>pmd-java</artifactId> + <version>${commons.pmd-impl.version}</version> + </dependency> + </dependencies> + <configuration> + <targetJdk>${maven.compiler.target}</targetJdk> + <skipEmptyReport>false</skipEmptyReport> + 
<rulesets> + <ruleset>${basedir}/src/site/resources/pmd/pmd-ruleset.xml</ruleset> + </rulesets> + </configuration> + </plugin> </plugins> </pluginManagement> <plugins> @@ -250,15 +275,6 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-pmd-plugin</artifactId> - <version>${commons.pmd.version}</version> - <configuration> - <targetJdk>${maven.compiler.target}</targetJdk> - <skipEmptyReport>false</skipEmptyReport> - <analysisCache>true</analysisCache> - <rulesets> - <ruleset>${basedir}/src/site/resources/pmd/pmd-ruleset.xml</ruleset> - </rulesets> - </configuration> </plugin> <!-- We need to add our test data files to rat exclusions --> @@ -322,15 +338,6 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-pmd-plugin</artifactId> - <version>${commons.pmd.version}</version> - <configuration> - <targetJdk>${maven.compiler.target}</targetJdk> - <skipEmptyReport>false</skipEmptyReport> - <analysisCache>true</analysisCache> - <rulesets> - <ruleset>${basedir}/src/site/resources/pmd/pmd-ruleset.xml</ruleset> - </rulesets> - </configuration> </plugin> <plugin> <groupId>org.codehaus.mojo</groupId> diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 4508d0c..74601d0 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -65,6 +65,7 @@ <action issue="CSV-184" type="add" dev="ggregory" due-to="Gaurav Agarwal, M. Steiger, Gary Gregory">Make the method CSVRecord.putIn(Map) public.</action> <action type="add" dev="ggregory" due-to="dota17">Add test cases for CSVRecord with get(Enum) and toString. 
#54.</action> <action type="add" dev="ggregory" due-to="Gary Gregory, dota17">Add and use CSVFormat.Builder, deprecated CSVFormat#with methods, based on #73.</action> + <action issue="CSV-206" type="add" dev="ggregory" due-to="Gary Gregory, dota17">Add support for String delimiters #76.</action> <!-- UPDATE --> <action type="update" dev="ggregory" due-to="Gary Gregory">Update org.junit.jupiter:junit-jupiter from 5.6.0 to 5.7.0, #84 #109</action> <action type="update" dev="ggregory" due-to="Gary Gregory">Update tests from Apache Commons Lang 3.9 to 3.12.0.</action> @@ -84,6 +85,7 @@ <action type="update" dev="ggregory" due-to="Dependabot">Bump commons.spotbugs.version from 4.0.4 to 4.2.3 (Java 16).</action> <action type="update" dev="ggregory" due-to="Gary Gregory">Bump commons.javadoc.version from 3.2.0 to 3.3.0.</action> <action type="update" dev="ggregory" due-to="Dependabot">Bump jmh-generator-annprocess from 1.5.2 to 1.32 #151.</action> + <action type="update" dev="ggregory" due-to="Dependabot">Bump PMD core from 6.29.0 to 6.36.0.</action> </release> <release version="1.8" date="2020-02-01" description="Feature and bug fix release (Java 8). diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 9b50ee6..e623dfe 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -196,14 +196,14 @@ public final class CSVFormat implements Serializable { private Character commentMarker; - private char delimiter; + private String delimiter; private Character escapeCharacter; - private String[] headers; - private String[] headerComments; + private String[] headers; + private boolean ignoreEmptyLines; private boolean ignoreHeaderCase; @@ -330,7 +330,17 @@ public final class CSVFormat implements Serializable { * @return This instance. 
*/ public Builder setDelimiter(final char delimiter) { - if (isLineBreak(delimiter)) { + return setDelimiter(String.valueOf(delimiter)); + } + + /** + * Sets the delimiter character. + * + * @param delimiter the delimiter character. + * @return This instance. + */ + public Builder setDelimiter(final String delimiter) { + if (containsLineBreak(delimiter)) { throw new IllegalArgumentException("The delimiter cannot be a line break"); } this.delimiter = delimiter; @@ -1147,11 +1157,33 @@ public final class CSVFormat implements Serializable { } /** + * Returns true if the given string contains the search char. + * + * @param source the string to check. + * + * @return true if {@code c} contains a line break character + */ + private static boolean contains(final String source, final char searchCh) { + return Objects.requireNonNull(source, "source").indexOf(searchCh) >= 0; + } + + /** + * Returns true if the given string contains a line break character. + * + * @param source the string to check. + * + * @return true if {@code c} contains a line break character. + */ + private static boolean containsLineBreak(final String source) { + return contains(source, CR) || contains(source, LF); + } + + /** * Returns true if the given character is a line break character. * - * @param c the character to check + * @param c the character to check. * - * @return true if {@code c} is a line break character + * @return true if {@code c} is a line break character. */ private static boolean isLineBreak(final char c) { return c == LF || c == CR; @@ -1160,9 +1192,9 @@ public final class CSVFormat implements Serializable { /** * Returns true if the given character is a line break character. * - * @param c the character to check, may be null + * @param c the character to check, may be null. * - * @return true if {@code c} is a line break character (and not null) + * @return true if {@code c} is a line break character (and not null). 
*/ private static boolean isLineBreak(final Character c) { return c != null && isLineBreak(c.charValue()); @@ -1186,7 +1218,8 @@ public final class CSVFormat implements Serializable { * @see #TDF */ public static CSVFormat newFormat(final char delimiter) { - return new CSVFormat(delimiter, null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false, true); + return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false, + true); } static String[] toStringArray(final Object[] values) { @@ -1195,8 +1228,7 @@ public final class CSVFormat implements Serializable { } final String[] strings = new String[values.length]; for (int i = 0; i < values.length; i++) { - final Object value = values[i]; - strings[i] = value == null ? null : value.toString(); + strings[i] = Objects.toString(values[i], null); } return strings; } @@ -1237,7 +1269,7 @@ public final class CSVFormat implements Serializable { private final Character commentMarker; // null if commenting is disabled - private final char delimiter; + private final String delimiter; private final Character escapeCharacter; // null if escaping is disabled @@ -1312,7 +1344,7 @@ public final class CSVFormat implements Serializable { * @param autoFlush TODO Doc me. * @throws IllegalArgumentException if the delimiter is a line break character. 
*/ - private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMode quoteMode, final Character commentStart, final Character escape, + private CSVFormat(final String delimiter, final Character quoteChar, final QuoteMode quoteMode, final Character commentStart, final Character escape, final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush, @@ -1353,30 +1385,18 @@ public final class CSVFormat implements Serializable { if (this == obj) { return true; } - if ((obj == null) || (getClass() != obj.getClass())) { + if (obj == null || getClass() != obj.getClass()) { return false; } - final CSVFormat other = (CSVFormat) obj; - if ((delimiter != other.delimiter) || (trailingDelimiter != other.trailingDelimiter) || (autoFlush != other.autoFlush) || (trim != other.trim)) { - return false; - } - if ((allowMissingColumnNames != other.allowMissingColumnNames) || (allowDuplicateHeaderNames != other.allowDuplicateHeaderNames) || - (ignoreHeaderCase != other.ignoreHeaderCase) || (quoteMode != other.quoteMode)) { - return false; - } - if (!Objects.equals(quoteCharacter, other.quoteCharacter) || !Objects.equals(commentMarker, other.commentMarker) || - !Objects.equals(escapeCharacter, other.escapeCharacter) || !Objects.equals(nullString, other.nullString)) { - return false; - } - if (!Arrays.equals(header, other.header) || (ignoreSurroundingSpaces != other.ignoreSurroundingSpaces) || - (ignoreEmptyLines != other.ignoreEmptyLines) || (skipHeaderRecord != other.skipHeaderRecord)) { - return false; - } - if (!Objects.equals(recordSeparator, other.recordSeparator) || !Arrays.equals(headerComments, other.headerComments)) { - return false; - } - return true; + return 
allowDuplicateHeaderNames == other.allowDuplicateHeaderNames && allowMissingColumnNames == other.allowMissingColumnNames && + autoFlush == other.autoFlush && Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) && + Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(header, other.header) && + Arrays.equals(headerComments, other.headerComments) && ignoreEmptyLines == other.ignoreEmptyLines && + ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && + Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode && + Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) && + skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim; } /** @@ -1437,11 +1457,22 @@ public final class CSVFormat implements Serializable { } /** - * Returns the character delimiting the values (typically ';', ',' or '\t'). + * Returns the first character delimiting the values (typically ';', ',' or '\t'). * - * @return the delimiter character + * @return the first delimiter character. + * @deprecated Use {@link #getDelimiterString()}. */ + @Deprecated public char getDelimiter() { + return delimiter.charAt(0); + } + + /** + * Returns the character delimiting the values (typically ";", "," or "\t"). + * + * @return the delimiter. 
+ */ + public String getDelimiterString() { return delimiter; } @@ -1571,9 +1602,14 @@ public final class CSVFormat implements Serializable { @Override public int hashCode() { - return Objects.hash(delimiter, quoteMode, quoteCharacter, commentMarker, escapeCharacter, nullString, ignoreSurroundingSpaces, ignoreHeaderCase, - ignoreEmptyLines, skipHeaderRecord, allowDuplicateHeaderNames, trim, autoFlush, trailingDelimiter, allowMissingColumnNames, recordSeparator, - Arrays.hashCode(header), Arrays.hashCode(headerComments)); + final int prime = 31; + int result = 1; + result = prime * result + Arrays.hashCode(header); + result = prime * result + Arrays.hashCode(headerComments); + result = prime * result + Objects.hash(allowDuplicateHeaderNames, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter, + ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, + skipHeaderRecord, trailingDelimiter, trim); + return result; } /** @@ -1588,6 +1624,37 @@ public final class CSVFormat implements Serializable { } /** + * Matches whether the next characters constitute a delimiter + * + * @param ch + * the current char + * @param charSeq + * the match char sequence + * @param startIndex + * where start to match + * @param delimiter + * the delimiter + * @param delimiterLength + * the delimiter length + * @return true if the match is successful + */ + private boolean isDelimiter(final char ch, final CharSequence charSeq, final int startIndex, final char[] delimiter, final int delimiterLength) { + if (ch != delimiter[0]) { + return false; + } + final int len = charSeq.length(); + if (startIndex + delimiterLength > len) { + return false; + } + for (int i = 1; i < delimiterLength; i++) { + if (charSeq.charAt(startIndex + i) != delimiter[i]) { + return false; + } + } + return true; + } + + /** * Returns whether escape are being processed. 
* * @return {@code true} if escapes are processed @@ -1702,7 +1769,7 @@ public final class CSVFormat implements Serializable { final int offset = 0; final int len = value.length(); if (!newRecord) { - out.append(getDelimiter()); + out.append(getDelimiterString()); } if (object == null) { out.append(value); @@ -1737,7 +1804,7 @@ public final class CSVFormat implements Serializable { private void print(final Reader reader, final Appendable out, final boolean newRecord) throws IOException { // Reader is never null if (!newRecord) { - out.append(getDelimiter()); + out.append(getDelimiterString()); } if (isQuoteCharacterSet()) { printWithQuotes(reader, out); @@ -1769,16 +1836,16 @@ public final class CSVFormat implements Serializable { /** * Outputs the trailing delimiter (if set) followed by the record separator (if set). * - * @param out where to write + * @param appendable where to write * @throws IOException If an I/O error occurs * @since 1.4 */ - public void println(final Appendable out) throws IOException { + public void println(final Appendable appendable) throws IOException { if (getTrailingDelimiter()) { - out.append(getDelimiter()); + appendable.append(getDelimiterString()); } if (recordSeparator != null) { - out.append(recordSeparator); + appendable.append(recordSeparator); } } @@ -1790,35 +1857,37 @@ public final class CSVFormat implements Serializable { * the record, so there is no need to call {@link #println(Appendable)}. * </p> * - * @param out where to write. + * @param appendable where to write. * @param values values to output. * @throws IOException If an I/O error occurs. * @since 1.4 */ - public void printRecord(final Appendable out, final Object... values) throws IOException { + public void printRecord(final Appendable appendable, final Object... 
values) throws IOException { for (int i = 0; i < values.length; i++) { - print(values[i], out, i == 0); + print(values[i], appendable, i == 0); } - println(out); + println(appendable); } /* - * Note: must only be called if escaping is enabled, otherwise will generate NPE + * Note: Must only be called if escaping is enabled, otherwise will generate NPE. */ - private void printWithEscapes(final CharSequence value, final Appendable out) throws IOException { + private void printWithEscapes(final CharSequence charSeq, final Appendable appendable) throws IOException { int start = 0; int pos = 0; - final int end = value.length(); + final int end = charSeq.length(); - final char delim = getDelimiter(); + final char[] delim = getDelimiterString().toCharArray(); + final int delimLength = delim.length; final char escape = getEscapeCharacter().charValue(); while (pos < end) { - char c = value.charAt(pos); - if (c == CR || c == LF || c == delim || c == escape) { + char c = charSeq.charAt(pos); + boolean isDelimiterStart = isDelimiter(c, charSeq, pos, delim, delimLength); + if (c == CR || c == LF || c == escape || isDelimiterStart) { // write out segment up until this char if (pos > start) { - out.append(value, start, pos); + appendable.append(charSeq, start, pos); } if (c == LF) { c = 'n'; @@ -1826,8 +1895,17 @@ public final class CSVFormat implements Serializable { c = 'r'; } - out.append(escape); - out.append(c); + appendable.append(escape); + appendable.append(c); + + if (isDelimiterStart) { + for (int i = 1; i < delimLength; i++) { + pos++; + c = charSeq.charAt(pos); + appendable.append(escape); + appendable.append(c); + } + } start = pos + 1; // start on the current char after this one } @@ -1836,7 +1914,7 @@ public final class CSVFormat implements Serializable { // write last segment if (pos > start) { - out.append(value, start, pos); + appendable.append(charSeq, start, pos); } } @@ -1844,14 +1922,19 @@ public final class CSVFormat implements Serializable { int start = 0; 
int pos = 0; - final char delim = getDelimiter(); + @SuppressWarnings("resource") // Temp reader on input reader. + final ExtendedBufferedReader bufferedReader = new ExtendedBufferedReader(reader); + final char[] delim = getDelimiterString().toCharArray(); + final int delimLength = delim.length; final char escape = getEscapeCharacter().charValue(); final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); int c; - while (-1 != (c = reader.read())) { + while (-1 != (c = bufferedReader.read())) { builder.append((char) c); - if (c == CR || c == LF || c == delim || c == escape) { + boolean isDelimiterStart = isDelimiter((char) c, builder.toString() + new String(bufferedReader.lookAhead(delimLength - 1)), pos, delim, + delimLength); + if (c == CR || c == LF || c == escape || isDelimiterStart) { // write out segment up until this char if (pos > start) { out.append(builder.substring(start, pos)); @@ -1867,6 +1950,14 @@ public final class CSVFormat implements Serializable { out.append(escape); out.append((char) c); + if (isDelimiterStart) { + for (int i = 1; i < delimLength; i++) { + c = bufferedReader.read(); + out.append(escape); + out.append((char) c); + } + } + start = pos + 1; // start on the current char after this one } pos++; @@ -1882,13 +1973,14 @@ public final class CSVFormat implements Serializable { * Note: must only be called if quoting is enabled, otherwise will generate NPE */ // the original object is needed so can check for Number - private void printWithQuotes(final Object object, final CharSequence value, final Appendable out, final boolean newRecord) throws IOException { + private void printWithQuotes(final Object object, final CharSequence charSeq, final Appendable out, final boolean newRecord) throws IOException { boolean quote = false; int start = 0; int pos = 0; - final int len = value.length(); + final int len = charSeq.length(); - final char delimChar = getDelimiter(); + final char[] delim = getDelimiterString().toCharArray(); 
+ final int delimLength = delim.length; final char quoteChar = getQuoteCharacter().charValue(); // If escape char not specified, default to the quote char // This avoids having to keep checking whether there is an escape character @@ -1909,7 +2001,7 @@ public final class CSVFormat implements Serializable { break; case NONE: // Use the existing escaping code - printWithEscapes(value, out); + printWithEscapes(charSeq, out); return; case MINIMAL: if (len <= 0) { @@ -1921,7 +2013,7 @@ public final class CSVFormat implements Serializable { quote = true; } } else { - char c = value.charAt(pos); + char c = charSeq.charAt(pos); if (c <= COMMENT) { // Some other chars at the start of a value caused the parser to fail, so for now @@ -1930,8 +2022,8 @@ public final class CSVFormat implements Serializable { quote = true; } else { while (pos < len) { - c = value.charAt(pos); - if (c == LF || c == CR || c == quoteChar || c == delimChar || c == escapeChar) { + c = charSeq.charAt(pos); + if (c == LF || c == CR || c == quoteChar || c == escapeChar || isDelimiter(c, charSeq, pos, delim, delimLength)) { quote = true; break; } @@ -1940,7 +2032,7 @@ public final class CSVFormat implements Serializable { if (!quote) { pos = len - 1; - c = value.charAt(pos); + c = charSeq.charAt(pos); // Some other chars at the end caused the parser to fail, so for now // encapsulate if we end in anything less than ' ' if (c <= SP) { @@ -1952,7 +2044,7 @@ public final class CSVFormat implements Serializable { if (!quote) { // no encapsulation needed - write out the original value - out.append(value, start, len); + out.append(charSeq, start, len); return; } break; @@ -1962,7 +2054,7 @@ public final class CSVFormat implements Serializable { if (!quote) { // no encapsulation needed - write out the original value - out.append(value, start, len); + out.append(charSeq, start, len); return; } @@ -1972,10 +2064,10 @@ public final class CSVFormat implements Serializable { // Pick up where we left off: pos should 
be positioned on the first character that caused // the need for encapsulation. while (pos < len) { - final char c = value.charAt(pos); + final char c = charSeq.charAt(pos); if (c == quoteChar || c == escapeChar) { // write out the chunk up until this point - out.append(value, start, pos); + out.append(charSeq, start, pos); out.append(escapeChar); // now output the escape start = pos; // and restart with the matched char } @@ -1983,7 +2075,7 @@ public final class CSVFormat implements Serializable { } // write the last segment - out.append(value, start, pos); + out.append(charSeq, start, pos); out.append(quoteChar); } @@ -1992,10 +2084,10 @@ public final class CSVFormat implements Serializable { * * @throws IOException If an I/O error occurs */ - private void printWithQuotes(final Reader reader, final Appendable out) throws IOException { + private void printWithQuotes(final Reader reader, final Appendable appendable) throws IOException { if (getQuoteMode() == QuoteMode.NONE) { - printWithEscapes(reader, out); + printWithEscapes(reader, appendable); return; } @@ -2004,7 +2096,7 @@ public final class CSVFormat implements Serializable { final char quote = getQuoteCharacter().charValue(); final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); - out.append(quote); + appendable.append(quote); int c; while (-1 != (c = reader.read())) { @@ -2012,23 +2104,23 @@ public final class CSVFormat implements Serializable { if (c == quote) { // write out segment up until this char if (pos > 0) { - out.append(builder.substring(0, pos)); + appendable.append(builder.substring(0, pos)); builder.setLength(0); pos = -1; } - out.append(quote); - out.append((char) c); + appendable.append(quote); + appendable.append((char) c); } pos++; } // write last segment if (pos > 0) { - out.append(builder.substring(0, pos)); + appendable.append(builder.substring(0, pos)); } - out.append(quote); + appendable.append(quote); } @Override @@ -2086,19 +2178,19 @@ public final class 
CSVFormat implements Serializable { * @throws IllegalArgumentException Throw when any attribute is invalid or inconsistent with other attributes. */ private void validate() throws IllegalArgumentException { - if (isLineBreak(delimiter)) { + if (containsLineBreak(delimiter)) { throw new IllegalArgumentException("The delimiter cannot be a line break"); } - if (quoteCharacter != null && delimiter == quoteCharacter.charValue()) { + if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { throw new IllegalArgumentException("The quoteChar character and the delimiter cannot be the same ('" + quoteCharacter + "')"); } - if (escapeCharacter != null && delimiter == escapeCharacter.charValue()) { + if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { throw new IllegalArgumentException("The escape character and the delimiter cannot be the same ('" + escapeCharacter + "')"); } - if (commentMarker != null && delimiter == commentMarker.charValue()) { + if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { throw new IllegalArgumentException("The comment start character and the delimiter cannot be the same ('" + commentMarker + "')"); } diff --git a/src/main/java/org/apache/commons/csv/Constants.java b/src/main/java/org/apache/commons/csv/Constants.java index a4d4d67..e8f0106 100644 --- a/src/main/java/org/apache/commons/csv/Constants.java +++ b/src/main/java/org/apache/commons/csv/Constants.java @@ -26,7 +26,7 @@ final class Constants { static final char BACKSPACE = '\b'; - static final char COMMA = ','; + static final String COMMA = ","; /** * Starts a comment, the remainder of the line is the comment. 
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 6668611..9a7243d 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -112,7 +112,7 @@ final class ExtendedBufferedReader extends BufferedReader { * @return the next character * * @throws IOException - * if there is an error in reading + * If an I/O error occurs */ int lookAhead() throws IOException { super.mark(1); @@ -122,6 +122,23 @@ final class ExtendedBufferedReader extends BufferedReader { return c; } + /** + * Returns the next n characters in the current reader without consuming them. The next call to {@link #read()} will still return the next value. This + * doesn't affect line number or last character. + * + * @param n the number characters look ahead. + * @return the next n characters. + * @throws IOException If an I/O error occurs + */ + char[] lookAhead(final int n) throws IOException { + final char[] buf = new char[n]; + super.mark(n); + super.read(buf, 0, n); + super.reset(); + + return buf; + } + @Override public int read() throws IOException { final int current = super.read(); @@ -130,7 +147,7 @@ final class ExtendedBufferedReader extends BufferedReader { eolCounter++; } lastChar = current; - this.position++; + position++; return lastChar; } diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 46e9791..a0c75aa 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -48,7 +48,7 @@ final class Lexer implements Closeable { */ private static final char DISABLED = '\ufffe'; - private final char delimiter; + private final char[] delimiter; private final char escape; private final char quoteChar; private final char commentStart; @@ -62,7 +62,7 @@ final class Lexer implements Closeable { 
Lexer(final CSVFormat format, final ExtendedBufferedReader reader) { this.reader = reader; - this.delimiter = format.getDelimiter(); + this.delimiter = format.getDelimiterString().toCharArray(); this.escape = mapNullToDisabled(format.getEscapeCharacter()); this.quoteChar = mapNullToDisabled(format.getQuoteCharacter()); this.commentStart = mapNullToDisabled(format.getCommentMarker()); @@ -111,26 +111,72 @@ final class Lexer implements Closeable { return ch == commentStart; } - boolean isDelimiter(final int ch) { - return ch == delimiter; + /** + * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#lookAhead(int)} + * + * @param ch + * the current character. + * @return true if the next characters constitute a delimiter. + * @throws IOException If an I/O error occurs. + */ + boolean isDelimiter(final int ch) throws IOException { + if (ch != delimiter[0]) { + return false; + } + final int len = delimiter.length - 1; + final char[] buf = reader.lookAhead(len); + for (int i = 0; i < len; i++) { + if (buf[i] != delimiter[i+1]) { + return false; + } + } + final int count = reader.read(buf, 0, len); + return count != END_OF_STREAM; } /** - * @return true if the given character indicates end of file + * Tests if the given character indicates end of file. + * + * @return true if the given character indicates end of file. */ boolean isEndOfFile(final int ch) { return ch == END_OF_STREAM; } + /** + * Tests if the given character is the escape character. + * + * @return true if the given character is the escape character. + */ boolean isEscape(final int ch) { return ch == escape; } + /** + * Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#lookAhead(int)}. + * + * For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "![!|!]". + * + * @return true if the next characters constitute a escape delimiter. 
+ * @throws IOException If an I/O error occurs. + */ + boolean isEscapeDelimiter() throws IOException { + final int len = 2 * delimiter.length - 1; + final char[] buf = reader.lookAhead(len); + if (buf[0] != delimiter[0]) { + return false; + } + for (int i = 1; i < delimiter.length; i++) { + if (buf[2 * i] != delimiter[i] || buf[2 * i - 1] != escape) { + return false; + } + } + final int count = reader.read(buf, 0, len); + return count != END_OF_STREAM; + } + private boolean isMetaChar(final int ch) { - return ch == delimiter || - ch == escape || - ch == quoteChar || - ch == commentStart; + return ch == escape || ch == quoteChar || ch == commentStart; } boolean isQuoteChar(final int ch) { @@ -138,7 +184,7 @@ final class Lexer implements Closeable { } /** - * Checks if the current character represents the start of a line: a CR, LF or is at the start of the file. + * Tests if the current character represents the start of a line: a CR, LF or is at the start of the file. * * @param ch the character to check * @return true if the character is at the start of a line. @@ -148,9 +194,12 @@ final class Lexer implements Closeable { } /** - * @return true if the given char is a whitespace character + * Tests if the given char is a whitespace character. + * + * @return true if the given char is a whitespace character. + * @throws IOException If an I/O error occurs. */ - boolean isWhitespace(final int ch) { + boolean isWhitespace(final int ch) throws IOException { return !isDelimiter(ch) && Character.isWhitespace((char) ch); } @@ -166,9 +215,8 @@ final class Lexer implements Closeable { * * @param token * an existing Token object to reuse. The caller is responsible to initialize the Token. - * @return the next token found - * @throws java.io.IOException - * on stream access error + * @return the next token found. + * @throws java.io.IOException on stream access error. 
*/ Token nextToken(final Token token) throws IOException { @@ -256,10 +304,11 @@ final class Lexer implements Closeable { /** * Parses an encapsulated token. - * <p/> + * <p> * Encapsulated tokens are surrounded by the given encapsulating-string. The encapsulator itself might be included * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after * an encapsulated token are ignored. The token is finished when one of the following conditions become true: + * </p> * <ul> * <li>an unescaped encapsulator has been reached, and is followed by optional whitespace then:</li> * <ul> @@ -282,11 +331,15 @@ final class Lexer implements Closeable { c = reader.read(); if (isEscape(c)) { - final int unescaped = readEscape(); - if (unescaped == END_OF_STREAM) { // unexpected char after escape - token.content.append((char) c).append((char) reader.getLastChar()); + if (isEscapeDelimiter()) { + token.content.append(delimiter); } else { - token.content.append((char) unescaped); + final int unescaped = readEscape(); + if (unescaped == END_OF_STREAM) { // unexpected char after escape + token.content.append((char) c).append((char) reader.getLastChar()); + } else { + token.content.append((char) unescaped); + } } } else if (isQuoteChar(c)) { if (isQuoteChar(reader.lookAhead())) { @@ -330,9 +383,10 @@ final class Lexer implements Closeable { /** * Parses a simple token. - * <p/> + * <p> * Simple token are tokens which are not surrounded by encapsulators. A simple token might contain escaped * delimiters (as \, or \;). 
The token is finished when one of the following conditions become true: + * </p> * <ul> * <li>end of line has been reached (EORECORD)</li> * <li>end of stream has been reached (EOF)</li> @@ -364,11 +418,15 @@ final class Lexer implements Closeable { break; } if (isEscape(ch)) { - final int unescaped = readEscape(); - if (unescaped == END_OF_STREAM) { // unexpected char after escape - token.content.append((char) ch).append((char) reader.getLastChar()); + if (isEscapeDelimiter()) { + token.content.append(delimiter); } else { - token.content.append((char) unescaped); + final int unescaped = readEscape(); + if (unescaped == END_OF_STREAM) { // unexpected char after escape + token.content.append((char) ch).append((char) reader.getLastChar()); + } else { + token.content.append((char) unescaped); + } } ch = reader.read(); // continue } else { diff --git a/src/site/resources/pmd/pmd-ruleset.xml b/src/site/resources/pmd/pmd-ruleset.xml index 21bae90..17a8fab 100644 --- a/src/site/resources/pmd/pmd-ruleset.xml +++ b/src/site/resources/pmd/pmd-ruleset.xml @@ -15,7 +15,7 @@ See the License for the specific language governing permissions and limitations under the License. 
--> -<ruleset name="commons-rng-customized" +<ruleset name="commons-csv-customized" xmlns="http://pmd.sourceforge.net/ruleset/2.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd"> @@ -35,7 +35,8 @@ <rule ref="category/java/codestyle.xml/DuplicateImports"/> <rule ref="category/java/codestyle.xml/ExtendsObject"/> <rule ref="category/java/codestyle.xml/ForLoopShouldBeWhileLoop"/> - <rule ref="category/java/codestyle.xml/TooManyStaticImports"/> + <!-- See custom rules --> + <!-- <rule ref="category/java/codestyle.xml/TooManyStaticImports"/> --> <rule ref="category/java/codestyle.xml/UnnecessaryFullyQualifiedName"/> <rule ref="category/java/codestyle.xml/UnnecessaryModifier"/> <rule ref="category/java/codestyle.xml/UnnecessaryReturn"/> @@ -82,9 +83,7 @@ <rule ref="category/java/codestyle.xml/TooManyStaticImports"> <properties> <property name="violationSuppressXPath" - value="//ClassOrInterfaceDeclaration[.[typeIs('org.apache.commons.csv.CSVFormat')]]" /> - <property name="violationSuppressXPath" - value="//ClassOrInterfaceDeclaration[.[typeIs('org.apache.commons.csv.Lexer')]]" /> + value="//ClassOrInterfaceDeclaration[contains(@Name, 'org.apache.commons.csv.CSVFormat')] or //ClassOrInterfaceDeclaration[contains(@SimpleName, 'org.apache.commons.csv.Lexer')]" /> </properties> </rule> diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 02d8c6c..392d82e 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -90,6 +90,84 @@ public class CSVParserTest { } @Test + public void testParseWithDelimiterWithQuote() throws IOException { + String source = "'a,b,c',xyz"; + CSVFormat csvFormat = CSVFormat.DEFAULT.withQuote('\''); + try (CSVParser csvParser = csvFormat.parse(new StringReader(source))) { + CSVRecord 
csvRecord = csvParser.nextRecord(); + assertEquals("a,b,c", csvRecord.get(0)); + assertEquals("xyz", csvRecord.get(1)); + } + } + + @Test + public void testParseWithDelimiterStringWithQuote() throws IOException { + String source = "'a[|]b[|]c'[|]xyz\r\nabc[abc][|]xyz"; + CSVFormat csvFormat = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setQuote('\'').build(); + try (CSVParser csvParser = csvFormat.parse(new StringReader(source))) { + CSVRecord csvRecord = csvParser.nextRecord(); + assertEquals("a[|]b[|]c", csvRecord.get(0)); + assertEquals("xyz", csvRecord.get(1)); + csvRecord = csvParser.nextRecord(); + assertEquals("abc[abc]", csvRecord.get(0)); + assertEquals("xyz", csvRecord.get(1)); + } + } + + @Test + public void testParseWithDelimiterWithEscape() throws IOException { + String source = "a!,b!,c,xyz"; + CSVFormat csvFormat = CSVFormat.DEFAULT.withEscape('!'); + try (CSVParser csvParser = csvFormat.parse(new StringReader(source))) { + CSVRecord csvRecord = csvParser.nextRecord(); + assertEquals("a,b,c", csvRecord.get(0)); + assertEquals("xyz", csvRecord.get(1)); + } + } + + @Test + public void testParseWithDelimiterStringWithEscape() throws IOException { + String source = "a![!|!]b![|]c[|]xyz\r\nabc[abc][|]xyz"; + CSVFormat csvFormat = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').build(); + try (CSVParser csvParser = csvFormat.parse(new StringReader(source))) { + CSVRecord csvRecord = csvParser.nextRecord(); + assertEquals("a[|]b![|]c", csvRecord.get(0)); + assertEquals("xyz", csvRecord.get(1)); + csvRecord = csvParser.nextRecord(); + assertEquals("abc[abc]", csvRecord.get(0)); + assertEquals("xyz", csvRecord.get(1)); + } + } + + @Test + public void testParseWithQuoteWithEscape() throws IOException { + String source = "'a?,b?,c?d',xyz"; + CSVFormat csvFormat = CSVFormat.DEFAULT.withQuote('\'').withEscape('?'); + try (CSVParser csvParser = csvFormat.parse(new StringReader(source))) { + CSVRecord csvRecord = csvParser.nextRecord(); + 
assertEquals("a,b,c?d", csvRecord.get(0)); + assertEquals("xyz", csvRecord.get(1)); + } + } + + @Test + public void testParseWithQuoteThrowsException() { + CSVFormat csvFormat = CSVFormat.DEFAULT.withQuote('\''); + assertThrows(IOException.class, () -> csvFormat.parse(new StringReader("'a,b,c','")).nextRecord()); + assertThrows(IOException.class, () -> csvFormat.parse(new StringReader("'a,b,c'abc,xyz")).nextRecord()); + assertThrows(IOException.class, () -> csvFormat.parse(new StringReader("'abc'a,b,c',xyz")).nextRecord()); + } + + @Test + public void testNotValueCSV() throws IOException { + String source = "#"; + CSVFormat csvFormat = CSVFormat.DEFAULT.withCommentMarker('#'); + CSVParser csvParser = csvFormat.parse(new StringReader(source)); + CSVRecord csvRecord = csvParser.nextRecord(); + assertNull(csvRecord); + } + + @Test public void testBackslashEscaping() throws IOException { // To avoid confusion over the need for escaping chars in java code, diff --git a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java index 5cef951..c470209 100644 --- a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java +++ b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java @@ -17,6 +17,7 @@ package org.apache.commons.csv; +import static org.apache.commons.csv.Constants.BACKSLASH; import static org.apache.commons.csv.Constants.CR; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -67,7 +68,6 @@ import org.junit.jupiter.api.Test; */ public class CSVPrinterTest { - private static final char BACKSLASH_CH = '\\'; private static final char DQUOTE_CHAR = '"'; private static final char EURO_CH = '\u20AC'; private static final int ITERATIONS_FOR_RANDOM_TEST = 50000; @@ -161,11 +161,14 @@ public class CSVPrinterTest { private CSVPrinter printWithHeaderComments(final StringWriter sw, final Date now, final CSVFormat baseFormat) throws 
IOException { - CSVFormat format = baseFormat; // Use withHeaderComments first to test CSV-145 - format = format.withHeaderComments("Generated by Apache Commons CSV 1.1", now); - format = format.withCommentMarker('#'); - format = format.withHeader("Col1", "Col2"); + // @formatter:off + CSVFormat format = baseFormat.builder() + .setHeaderComments("Generated by Apache Commons CSV 1.1", now) + .setCommentMarker('#') + .setHeader("Col1", "Col2") + .build(); + // @formatter:on final CSVPrinter csvPrinter = format.print(sw); csvPrinter.printRecord("A", "B"); csvPrinter.printRecord("C", "D"); @@ -209,7 +212,7 @@ public class CSVPrinterTest { ch = '\''; break; case 8: - ch = BACKSLASH_CH; + ch = BACKSLASH; break; default: ch = (char) r.nextInt(300); @@ -357,6 +360,28 @@ public class CSVPrinterTest { } @Test + public void testDelimeterStringQuoted() throws IOException { + final StringWriter sw = new StringWriter(); + try (final CSVPrinter printer = new CSVPrinter(sw, CSVFormat.DEFAULT.builder().setDelimiter("[|]").setQuote('\'').build())) { + printer.print("a[|]b[|]c"); + printer.print("xyz"); + assertEquals("'a[|]b[|]c'[|]xyz", sw.toString()); + } + } + + @Test + public void testDelimeterStringQuoteNone() throws IOException { + final StringWriter sw = new StringWriter(); + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').setQuoteMode(QuoteMode.NONE).build(); + try (final CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.print("a[|]b[|]c"); + printer.print("xyz"); + printer.print("a[xy]bc[]"); + assertEquals("a![!|!]b![!|!]c[|]xyz[|]a[xy]bc[]", sw.toString()); + } + } + + @Test public void testDelimiterEscaped() throws IOException { final StringWriter sw = new StringWriter(); try (final CSVPrinter printer = new CSVPrinter(sw, CSVFormat.DEFAULT.withEscape('!').withQuote(null))) { @@ -377,6 +402,16 @@ public class CSVPrinterTest { } @Test + public void testDelimiterStringEscaped() throws IOException { + final StringWriter sw = 
new StringWriter(); + try (final CSVPrinter printer = new CSVPrinter(sw, CSVFormat.DEFAULT.builder().setDelimiter("|||").setEscape('!').setQuote(null).build())) { + printer.print("a|||b|||c"); + printer.print("xyz"); + assertEquals("a!|!|!|b!|!|!|c|||xyz", sw.toString()); + } + } + + @Test public void testDisabledComment() throws IOException { final StringWriter sw = new StringWriter(); try (final CSVPrinter printer = new CSVPrinter(sw, CSVFormat.DEFAULT)) { @@ -688,7 +723,7 @@ public class CSVPrinterTest { @Test @Disabled public void testJira135_part1() throws IOException { - final CSVFormat format = CSVFormat.DEFAULT.withRecordSeparator('\n').withQuote(DQUOTE_CHAR).withEscape(BACKSLASH_CH); + final CSVFormat format = CSVFormat.DEFAULT.withRecordSeparator('\n').withQuote(DQUOTE_CHAR).withEscape(BACKSLASH); final StringWriter sw = new StringWriter(); final List<String> list = new LinkedList<>(); try (final CSVPrinter printer = new CSVPrinter(sw, format)) { @@ -704,7 +739,7 @@ public class CSVPrinterTest { @Test @Disabled public void testJira135_part2() throws IOException { - final CSVFormat format = CSVFormat.DEFAULT.withRecordSeparator('\n').withQuote(DQUOTE_CHAR).withEscape(BACKSLASH_CH); + final CSVFormat format = CSVFormat.DEFAULT.withRecordSeparator('\n').withQuote(DQUOTE_CHAR).withEscape(BACKSLASH); final StringWriter sw = new StringWriter(); final List<String> list = new LinkedList<>(); try (final CSVPrinter printer = new CSVPrinter(sw, format)) { @@ -720,7 +755,7 @@ public class CSVPrinterTest { @Test @Disabled public void testJira135_part3() throws IOException { - final CSVFormat format = CSVFormat.DEFAULT.withRecordSeparator('\n').withQuote(DQUOTE_CHAR).withEscape(BACKSLASH_CH); + final CSVFormat format = CSVFormat.DEFAULT.withRecordSeparator('\n').withQuote(DQUOTE_CHAR).withEscape(BACKSLASH); final StringWriter sw = new StringWriter(); final List<String> list = new LinkedList<>(); try (final CSVPrinter printer = new CSVPrinter(sw, format)) { @@ -736,7 
+771,7 @@ public class CSVPrinterTest { @Test @Disabled public void testJira135All() throws IOException { - final CSVFormat format = CSVFormat.DEFAULT.withRecordSeparator('\n').withQuote(DQUOTE_CHAR).withEscape(BACKSLASH_CH); + final CSVFormat format = CSVFormat.DEFAULT.withRecordSeparator('\n').withQuote(DQUOTE_CHAR).withEscape(BACKSLASH); final StringWriter sw = new StringWriter(); final List<String> list = new LinkedList<>(); try (final CSVPrinter printer = new CSVPrinter(sw, format)) { diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv206Test.java b/src/test/java/org/apache/commons/csv/issues/JiraCsv206Test.java new file mode 100644 index 0000000..57f1149 --- /dev/null +++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv206Test.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.csv.issues; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Iterator; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVPrinter; +import org.apache.commons.csv.CSVRecord; +import org.junit.jupiter.api.Test; + +public class JiraCsv206Test { + + @Test + public void testJiraCsv206MultipleCharacterDelimiter() throws IOException { + // Read with multiple character delimiter + final String source = "FirstName[|]LastName[|]Address\r\nJohn[|]Smith[|]123 Main St."; + final StringReader reader = new StringReader(source); + final CSVFormat csvFormat = CSVFormat.DEFAULT.builder().setDelimiter("[|]").build(); + CSVRecord record = null; + try (final CSVParser csvParser = new CSVParser(reader, csvFormat)) { + Iterator<CSVRecord> iterator = csvParser.iterator(); + record = iterator.next(); + assertEquals("FirstName", record.get(0)); + assertEquals("LastName", record.get(1)); + assertEquals("Address", record.get(2)); + record = iterator.next(); + assertEquals("John", record.get(0)); + assertEquals("Smith", record.get(1)); + assertEquals("123 Main St.", record.get(2)); + } + // Write with multiple character delimiter + final String outString = "# Change delimiter to [I]\r\n" + "first name[I]last name[I]address\r\n" + "John[I]Smith[I]123 Main St."; + final String comment = "Change delimiter to [I]"; + // @formatter:off + final CSVFormat format = CSVFormat.EXCEL.builder() + .setDelimiter("[I]").setHeader("first name", "last name", "address") + .setCommentMarker('#') + .setHeaderComments(comment).build(); + // @formatter:on + final StringBuilder out = new StringBuilder(); + try (final CSVPrinter printer = format.print(out)) { + printer.print(record.get(0)); + printer.print(record.get(1)); + printer.print(record.get(2)); + } + final String s = out.toString(); + assertEquals(outString, s); + 
} +} \ No newline at end of file