This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-csv.git
commit 76981db68af50e0475a2d5d0fcc6bdf87efb18f1 Author: Gary Gregory <garydgreg...@gmail.com> AuthorDate: Thu Jan 2 15:14:13 2025 -0500 Sort members --- .../java/org/apache/commons/csv/CSVParser.java | 24 ++-- .../java/org/apache/commons/csv/CSVRecord.java | 20 +-- .../apache/commons/csv/ExtendedBufferedReader.java | 108 ++++++++-------- src/main/java/org/apache/commons/csv/Lexer.java | 16 +-- .../java/org/apache/commons/csv/CSVParserTest.java | 140 ++++++++++----------- 5 files changed, 154 insertions(+), 154 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 07028ea7..a7067657 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -181,6 +181,18 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable { return asThis(); } + /** + * Sets whether to enable byte tracking for the parser. + * + * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it. + * @return this instance. + * @since 1.13.0 + */ + public Builder setEnableByteTracking(final boolean enableByteTracking) { + this.enableByteTracking = enableByteTracking; + return asThis(); + } + /** * Sets the CSV format. A copy of the given format is kept. * @@ -203,18 +215,6 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable { return asThis(); } - /** - * Sets whether to enable byte tracking for the parser. - * - * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it. - * @return this instance. - * @since 1.13.0 - */ - public Builder setEnableByteTracking(final boolean enableByteTracking) { - this.enableByteTracking = enableByteTracking; - return asThis(); - } - } final class CSVRecordIterator implements Iterator<CSVRecord> { diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index 284220c3..689cd0a2 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -141,16 +141,6 @@ public final class CSVRecord implements Serializable, Iterable<String> { } } - /** - * Returns the start position of this record as a character position in the source stream. This may or may not - * correspond to the byte position depending on the character set. - * - * @return the position of this record in the source stream. - */ - public long getCharacterPosition() { - return characterPosition; - } - /** * Returns the starting position of this record in the source stream, measured in bytes. * @@ -161,6 +151,16 @@ public final class CSVRecord implements Serializable, Iterable<String> { return bytePosition; } + /** + * Returns the start position of this record as a character position in the source stream. This may or may not + * correspond to the byte position depending on the character set. + * + * @return the position of this record in the source stream. + */ + public long getCharacterPosition() { + return characterPosition; + } + /** * Returns the comment for this record, if any. * Note that comments are attached to the following record. diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 6043ccaf..31890db8 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -98,6 +98,60 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super.close(); } + /** + * Gets the number of bytes read by the reader. + * + * @return the number of bytes read by the read + */ + long getBytesRead() { + return this.bytesRead; + } + + /** + * Gets the byte length of the given character based on the the original Unicode + * specification, which defined characters as fixed-width 16-bit entities. + * <p> + * The Unicode characters are divided into two main ranges: + * <ul> + * <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b> + * <ul> + * <li>Represented using a single 16-bit {@code char}.</li> + * <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li> + * </ul> + * </li> + * <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b> + * <ul> + * <li>Represented as a pair of {@code char}s:</li> + * <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li> + * <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li> + * <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li> + * </ul> + * </li> + * </ul> + * + * @param current the current character to process. + * @return the byte length of the character. + * @throws CharacterCodingException if the character cannot be encoded. + */ + private int getEncodedCharLength(int current) throws CharacterCodingException { + final char cChar = (char) current; + final char lChar = (char) lastChar; + if (!Character.isSurrogate(cChar)) { + return encoder.encode( + CharBuffer.wrap(new char[] {cChar})).limit(); + } else { + if (Character.isHighSurrogate(cChar)) { + // Move on to the next char (low surrogate) + return 0; + } else if (Character.isSurrogatePair(lChar, cChar)) { + return encoder.encode( + CharBuffer.wrap(new char[] {lChar, cChar})).limit(); + } else { + throw new CharacterCodingException(); + } + } + } + /** * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by * any of the read methods. This will not include a character read using the {@link #peek()} method. If no @@ -156,51 +210,6 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { return lastChar; } - /** - * Gets the byte length of the given character based on the the original Unicode - * specification, which defined characters as fixed-width 16-bit entities. - * <p> - * The Unicode characters are divided into two main ranges: - * <ul> - * <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b> - * <ul> - * <li>Represented using a single 16-bit {@code char}.</li> - * <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li> - * </ul> - * </li> - * <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b> - * <ul> - * <li>Represented as a pair of {@code char}s:</li> - * <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li> - * <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li> - * <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li> - * </ul> - * </li> - * </ul> - * - * @param current the current character to process. - * @return the byte length of the character. - * @throws CharacterCodingException if the character cannot be encoded. - */ - private int getEncodedCharLength(int current) throws CharacterCodingException { - final char cChar = (char) current; - final char lChar = (char) lastChar; - if (!Character.isSurrogate(cChar)) { - return encoder.encode( - CharBuffer.wrap(new char[] {cChar})).limit(); - } else { - if (Character.isHighSurrogate(cChar)) { - // Move on to the next char (low surrogate) - return 0; - } else if (Character.isSurrogatePair(lChar, cChar)) { - return encoder.encode( - CharBuffer.wrap(new char[] {lChar, cChar})).limit(); - } else { - throw new CharacterCodingException(); - } - } - } - @Override public int read(final char[] buf, final int offset, final int length) throws IOException { if (length == 0) { @@ -269,13 +278,4 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super.reset(); } - /** - * Gets the number of bytes read by the reader. - * - * @return the number of bytes read by the read - */ - long getBytesRead() { - return this.bytesRead; - } - } diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 2e7d2d04..2e9e7137 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -97,21 +97,21 @@ final class Lexer implements Closeable { } /** - * Returns the current character position + * Gets the number of bytes read * - * @return the current character position + * @return the number of bytes read */ - long getCharacterPosition() { - return reader.getPosition(); + long getBytesRead() { + return reader.getBytesRead(); } /** - * Gets the number of bytes read + * Returns the current character position * - * @return the number of bytes read + * @return the current character position */ - long getBytesRead() { - return reader.getBytesRead(); + long getCharacterPosition() { + return reader.getPosition(); } /** diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index c42a3c25..2f508b36 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -703,76 +703,6 @@ public class CSVParserTest { } } - @Test - public void testGetRecordThreeBytesRead() throws Exception { - final String code = "id,date,val5,val4\n" + - "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + - "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + - "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; - final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .get(); - try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get() ) { - CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); - - assertEquals(0, parser.getRecordNumber()); - assertNotNull(record = parser.nextRecord()); - assertEquals(1, record.getRecordNumber()); - assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getBytePosition(), record.getCharacterPosition()); - - assertNotNull(record = parser.nextRecord()); - assertEquals(2, record.getRecordNumber()); - assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getBytePosition(), record.getCharacterPosition()); - - assertNotNull(record = parser.nextRecord()); - assertEquals(3, record.getRecordNumber()); - assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getBytePosition(), 95); - - assertNotNull(record = parser.nextRecord()); - assertEquals(4, record.getRecordNumber()); - assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getBytePosition(), 154); - } - } - - @Test - public void testGetRecordFourBytesRead() throws Exception { - final String code = "id,a,b,c\n" + - "1,😊,🤔,😂\n" + - "2,😊,🤔,😂\n" + - "3,😊,🤔,😂\n"; - final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .get(); - try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get()) { - CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); - - assertEquals(0, parser.getRecordNumber()); - assertNotNull(record = parser.nextRecord()); - assertEquals(1, record.getRecordNumber()); - assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getBytePosition(), record.getCharacterPosition()); - - assertNotNull(record = parser.nextRecord()); - assertEquals(2, record.getRecordNumber()); - assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getBytePosition(), record.getCharacterPosition()); - assertNotNull(record = parser.nextRecord()); - assertEquals(3, record.getRecordNumber()); - assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getBytePosition(), 26); - assertNotNull(record = parser.nextRecord()); - assertEquals(4, record.getRecordNumber()); - assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getBytePosition(), 43); - } - } - @Test public void testGetHeaderMap() throws Exception { try (CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) { @@ -878,6 +808,40 @@ public class CSVParserTest { } } + @Test + public void testGetRecordFourBytesRead() throws Exception { + final String code = "id,a,b,c\n" + + "1,😊,🤔,😂\n" + + "2,😊,🤔,😂\n" + + "3,😊,🤔,😂\n"; + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .get(); + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get()) { + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); + + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), 26); + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), 43); + } + } + @Test public void testGetRecordNumberWithCR() throws Exception { validateRecordNumbers(String.valueOf(CR)); @@ -923,6 +887,42 @@ public class CSVParserTest { } + @Test + public void testGetRecordThreeBytesRead() throws Exception { + final String code = "id,date,val5,val4\n" + + "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .get(); + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get() ) { + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); + + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), 95); + + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), 154); + } + } + @Test public void testGetRecordWithMultiLineValues() throws Exception { try (CSVParser parser = CSVParser.parse("\"a\r\n1\",\"a\r\n2\"" + CRLF + "\"b\r\n1\",\"b\r\n2\"" + CRLF + "\"c\r\n1\",\"c\r\n2\"",