This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-io.git
The following commit(s) were added to refs/heads/master by this push: new c7817ac [IO-638] Infinite loop in CharSequenceInputStream.read for 4-byte characters with UTF-8 and 3-byte buffer. c7817ac is described below commit c7817acf1e125d459af7f3a4aa161fac8454bced Author: Gary Gregory <gardgreg...@gmail.com> AuthorDate: Sun Sep 26 12:12:36 2021 -0400 [IO-638] Infinite loop in CharSequenceInputStream.read for 4-byte characters with UTF-8 and 3-byte buffer. [IO-716] ReaderInputStream enter infinite loop for too small buffer sizes. Use CharsetEncoder#maxBytesPerChar() * 2 the min. --- src/changes/changes.xml | 3 + .../commons/io/input/CharSequenceInputStream.java | 17 ++- .../apache/commons/io/input/ReaderInputStream.java | 129 ++++++++++----------- .../io/input/CharSequenceInputStreamTest.java | 7 +- .../commons/io/input/ReaderInputStreamTest.java | 10 +- 5 files changed, 85 insertions(+), 81 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index da2dee0..f2122fc 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -98,6 +98,9 @@ The <action> type attribute can be add,update,fix,remove. <action issue="IO-716" dev="ggregory" type="fix" due-to="Marcono1234, Gary Gregory"> ReaderInputStream enter infinite loop for too small buffer sizes. </action> + <action issue="IO-638" dev="ggregory" type="fix" due-to=", Gary Gregory"> + Infinite loop in CharSequenceInputStream.read for 4-byte characters with UTF-8 and 3-byte buffer. + </action> <!-- ADD --> <action dev="ggregory" type="add" due-to="Gary Gregory"> Add BrokenReader.INSTANCE. diff --git a/src/main/java/org/apache/commons/io/input/CharSequenceInputStream.java b/src/main/java/org/apache/commons/io/input/CharSequenceInputStream.java index ddce28c..da45fa1 100644 --- a/src/main/java/org/apache/commons/io/input/CharSequenceInputStream.java +++ b/src/main/java/org/apache/commons/io/input/CharSequenceInputStream.java @@ -45,7 +45,7 @@ public class CharSequenceInputStream extends InputStream { private static final int NO_MARK = -1; - private final CharsetEncoder encoder; + private final CharsetEncoder charsetEncoder; private final CharBuffer cbuf; private final ByteBuffer bbuf; @@ -73,16 +73,13 @@ public class CharSequenceInputStream extends InputStream { * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character */ public CharSequenceInputStream(final CharSequence cs, final Charset charset, final int bufferSize) { - this.encoder = charset.newEncoder() + // @formatter:off + this.charsetEncoder = charset.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); + // @formatter:on // Ensure that buffer is long enough to hold a complete character - final float maxBytesPerChar = encoder.maxBytesPerChar(); - if (bufferSize < maxBytesPerChar) { - throw new IllegalArgumentException("Buffer size " + bufferSize + " is less than maxBytesPerChar " + - maxBytesPerChar); - } - this.bbuf = ByteBuffer.allocate(bufferSize); + this.bbuf = ByteBuffer.allocate(ReaderInputStream.checkMinBufferSize(charsetEncoder, bufferSize)); this.bbuf.flip(); this.cbuf = CharBuffer.wrap(cs); this.mark_cbuf = NO_MARK; @@ -141,7 +138,7 @@ public class CharSequenceInputStream extends InputStream { */ private void fillBuffer() throws CharacterCodingException { this.bbuf.compact(); - final CoderResult result = this.encoder.encode(this.cbuf, this.bbuf, true); + final CoderResult result = this.charsetEncoder.encode(this.cbuf, this.bbuf, true); if (result.isError()) { result.throwException(); } @@ -232,7 +229,7 @@ public class CharSequenceInputStream extends InputStream { if (this.mark_cbuf != NO_MARK) { // if cbuf is at 0, we have not started reading anything, so skip re-encoding if (this.cbuf.position() != 0) { - this.encoder.reset(); + this.charsetEncoder.reset(); this.cbuf.rewind(); this.bbuf.rewind(); this.bbuf.limit(0); // rewind does not clear the buffer diff --git a/src/main/java/org/apache/commons/io/input/ReaderInputStream.java b/src/main/java/org/apache/commons/io/input/ReaderInputStream.java index de20bcb..4a52715 100644 --- a/src/main/java/org/apache/commons/io/input/ReaderInputStream.java +++ b/src/main/java/org/apache/commons/io/input/ReaderInputStream.java @@ -30,51 +30,46 @@ import java.nio.charset.CodingErrorAction; import java.util.Objects; /** - * {@link InputStream} implementation that reads a character stream from a {@link Reader} - * and transforms it to a byte stream using a specified charset encoding. The stream - * is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset - * encodings supported by the JRE are handled correctly. In particular for charsets such as - * UTF-16, the implementation ensures that one and only one byte order marker - * is produced. + * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte + * stream using a specified charset encoding. The stream is transformed using a {@link CharsetEncoder} object, + * guaranteeing that all charset encodings supported by the JRE are handled correctly. In particular for charsets such + * as UTF-16, the implementation ensures that one and only one byte order marker is produced. * <p> - * Since in general it is not possible to predict the number of characters to be read from the - * {@link Reader} to satisfy a read request on the {@link ReaderInputStream}, all reads from - * the {@link Reader} are buffered. There is therefore no well defined correlation - * between the current position of the {@link Reader} and that of the {@link ReaderInputStream}. - * This also implies that in general there is no need to wrap the underlying {@link Reader} + * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy + * a read request on the {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore + * no well defined correlation between the current position of the {@link Reader} and that of the + * {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} * in a {@link java.io.BufferedReader}. * </p> * <p> - * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; - * in the following example, reading from {@code in2} would return the same byte - * sequence as reading from {@code in} (provided that the initial byte sequence is legal - * with respect to the charset encoding): + * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the + * following example, reading from {@code in2} would return the same byte sequence as reading from {@code in} (provided + * that the initial byte sequence is legal with respect to the charset encoding): * </p> + * * <pre> * InputStream inputStream = ... * Charset cs = ... * InputStreamReader reader = new InputStreamReader(inputStream, cs); - * ReaderInputStream in2 = new ReaderInputStream(reader, cs);</pre> + * ReaderInputStream in2 = new ReaderInputStream(reader, cs); + * </pre> * <p> - * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, - * except that the control flow is reversed: both classes transform a character stream - * into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, - * while {@link ReaderInputStream} pulls it from the underlying stream. + * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the + * control flow is reversed: both classes transform a character stream into a byte stream, but + * {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} pulls it + * from the underlying stream. * </p> * <p> - * Note that while there are use cases where there is no alternative to using - * this class, very often the need to use this class is an indication of a flaw - * in the design of the code. This class is typically used in situations where an existing - * API only accepts an {@link InputStream}, but where the most natural way to produce the data - * is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation - * where this problem may appear is when implementing the {@code javax.activation.DataSource} - * interface from the Java Activation Framework. + * Note that while there are use cases where there is no alternative to using this class, very often the need to use + * this class is an indication of a flaw in the design of the code. This class is typically used in situations where an + * existing API only accepts an {@link InputStream}, but where the most natural way to produce the data is as a + * character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may + * appear is when implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework. * </p> * <p> - * Given the fact that the {@link Reader} class doesn't provide any way to predict whether the next - * read operation will block or not, it is not possible to provide a meaningful - * implementation of the {@link InputStream#available()} method. A call to this method - * will always return 0. Also, this class doesn't support {@link InputStream#mark(int)}. + * Given the fact that the {@link Reader} class doesn't provide any way to predict whether the next read operation will + * block or not, it is not possible to provide a meaningful implementation of the {@link InputStream#available()} + * method. A call to this method will always return 0. Also, this class doesn't support {@link InputStream#mark(int)}. * </p> * <p> * Instances of {@link ReaderInputStream} are not thread safe. @@ -86,35 +81,41 @@ import java.util.Objects; public class ReaderInputStream extends InputStream { private static final int DEFAULT_BUFFER_SIZE = 1024; - private static int checkBufferSize(int bufferSize) { - if (bufferSize < 2) { - throw new IllegalArgumentException("Buffer size < 2"); + static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) { + final float minRequired = minBufferSize(charsetEncoder); + if (bufferSize < minRequired) { + throw new IllegalArgumentException( + String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, charsetEncoder.charset().displayName())); } return bufferSize; } + + static float minBufferSize(final CharsetEncoder charsetEncoder) { + return charsetEncoder.maxBytesPerChar() * 2; + } + private final Reader reader; - private final CharsetEncoder encoder; + private final CharsetEncoder charsetEncoder; /** - * CharBuffer used as input for the decoder. It should be reasonably - * large as we read data from the underlying Reader into this buffer. + * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader + * into this buffer. */ private final CharBuffer encoderIn; - /** - * ByteBuffer used as output for the decoder. This buffer can be small - * as it is only used to transfer data from the decoder to the - * buffer provided by the caller. + * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the + * decoder to the buffer provided by the caller. */ private final ByteBuffer encoderOut; + private CoderResult lastCoderResult; private boolean endOfInput; /** - * Constructs a new {@link ReaderInputStream} that uses the default character encoding - * with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE} characters. + * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size + * of {@value #DEFAULT_BUFFER_SIZE} characters. * * @param reader the target {@link Reader} * @deprecated 2.5 use {@link #ReaderInputStream(Reader, Charset)} instead @@ -125,8 +126,8 @@ public class ReaderInputStream extends InputStream { } /** - * Constructs a new {@link ReaderInputStream} with a default input buffer size of - * {@value #DEFAULT_BUFFER_SIZE} characters. + * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE} + * characters. * * @param reader the target {@link Reader} * @param charset the charset encoding @@ -148,7 +149,7 @@ public class ReaderInputStream extends InputStream { charset.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE), - checkBufferSize(bufferSize)); + bufferSize); // @formatter:on } @@ -156,33 +157,33 @@ public class ReaderInputStream extends InputStream { * Constructs a new {@link ReaderInputStream}. * * @param reader the target {@link Reader} - * @param encoder the charset encoder + * @param charsetEncoder the charset encoder * @since 2.1 */ - public ReaderInputStream(final Reader reader, final CharsetEncoder encoder) { - this(reader, encoder, DEFAULT_BUFFER_SIZE); + public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) { + this(reader, charsetEncoder, DEFAULT_BUFFER_SIZE); } /** * Constructs a new {@link ReaderInputStream}. * * @param reader the target {@link Reader} - * @param encoder the charset encoder + * @param charsetEncoder the charset encoder * @param bufferSize the size of the input buffer in number of characters * @since 2.1 */ - public ReaderInputStream(final Reader reader, final CharsetEncoder encoder, final int bufferSize) { + public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) { this.reader = reader; - this.encoder = encoder; - this.encoderIn = CharBuffer.allocate(bufferSize); + this.charsetEncoder = charsetEncoder; + this.encoderIn = CharBuffer.allocate(checkMinBufferSize(charsetEncoder, bufferSize)); this.encoderIn.flip(); this.encoderOut = ByteBuffer.allocate(128); this.encoderOut.flip(); } /** - * Constructs a new {@link ReaderInputStream} with a default input buffer size of - * {@value #DEFAULT_BUFFER_SIZE} characters. + * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE} + * characters. * * @param reader the target {@link Reader} * @param charsetName the name of the charset encoding @@ -203,8 +204,8 @@ public class ReaderInputStream extends InputStream { } /** - * Close the stream. This method will cause the underlying {@link Reader} - * to be closed. + * Close the stream. This method will cause the underlying {@link Reader} to be closed. + * * @throws IOException if an I/O error occurs. */ @Override @@ -215,8 +216,7 @@ public class ReaderInputStream extends InputStream { /** * Fills the internal char buffer from the reader. * - * @throws IOException - * If an I/O error occurs + * @throws IOException If an I/O error occurs */ private void fillBuffer() throws IOException { if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) { @@ -234,7 +234,7 @@ public class ReaderInputStream extends InputStream { encoderIn.flip(); } encoderOut.compact(); - lastCoderResult = encoder.encode(encoderIn, encoderOut, endOfInput); + lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput); if (lastCoderResult.isError()) { lastCoderResult.throwException(); } @@ -244,8 +244,7 @@ public class ReaderInputStream extends InputStream { /** * Read a single byte. * - * @return either the byte read or {@code -1} if the end of the stream - * has been reached + * @return either the byte read or {@code -1} if the end of the stream has been reached * @throws IOException if an I/O error occurs. */ @Override @@ -265,8 +264,7 @@ public class ReaderInputStream extends InputStream { * Read the specified number of bytes into an array. * * @param b the byte array to read into - * @return the number of bytes read or {@code -1} - * if the end of the stream has been reached + * @return the number of bytes read or {@code -1} if the end of the stream has been reached * @throws IOException if an I/O error occurs. */ @Override @@ -280,8 +278,7 @@ public class ReaderInputStream extends InputStream { * @param array the byte array to read into * @param off the offset to start reading bytes into * @param len the number of bytes to read - * @return the number of bytes read or {@code -1} - * if the end of the stream has been reached + * @return the number of bytes read or {@code -1} if the end of the stream has been reached * @throws IOException if an I/O error occurs. */ @Override diff --git a/src/test/java/org/apache/commons/io/input/CharSequenceInputStreamTest.java b/src/test/java/org/apache/commons/io/input/CharSequenceInputStreamTest.java index 3dfaba0..13dee59 100644 --- a/src/test/java/org/apache/commons/io/input/CharSequenceInputStreamTest.java +++ b/src/test/java/org/apache/commons/io/input/CharSequenceInputStreamTest.java @@ -25,6 +25,7 @@ import static org.junit.jupiter.api.Assertions.fail; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.Random; import java.util.Set; @@ -264,12 +265,14 @@ private boolean isAvailabilityTestableForCharset(final String csName) { @Test public void testIO_356_Loop_UTF16() throws Exception { - testIO_356_Loop("UTF-16", 4); + final Charset charset = StandardCharsets.UTF_16; + testIO_356_Loop(charset.displayName(), (int) ReaderInputStream.minBufferSize(charset.newEncoder())); } @Test public void testIO_356_Loop_UTF8() throws Exception { - testIO_356_Loop("UTF-8", 4); + final Charset charset = StandardCharsets.UTF_8; + testIO_356_Loop(charset.displayName(), (int) ReaderInputStream.minBufferSize(charset.newEncoder())); } @Test diff --git a/src/test/java/org/apache/commons/io/input/ReaderInputStreamTest.java b/src/test/java/org/apache/commons/io/input/ReaderInputStreamTest.java index 5c985da..adabf8b 100644 --- a/src/test/java/org/apache/commons/io/input/ReaderInputStreamTest.java +++ b/src/test/java/org/apache/commons/io/input/ReaderInputStreamTest.java @@ -52,13 +52,16 @@ public class ReaderInputStreamTest { @Test public void testBufferTooSmall() throws IOException { + assertThrows(IllegalArgumentException.class, () -> new ReaderInputStream(new StringReader("\uD800"), StandardCharsets.UTF_8, -1)); + assertThrows(IllegalArgumentException.class, () -> new ReaderInputStream(new StringReader("\uD800"), StandardCharsets.UTF_8, 0)); assertThrows(IllegalArgumentException.class, () -> new ReaderInputStream(new StringReader("\uD800"), StandardCharsets.UTF_8, 1)); } @Test @Timeout(value = 500, unit = TimeUnit.MILLISECONDS) public void testBufferSmallest() throws IOException { - try (InputStream in = new ReaderInputStream(new StringReader("\uD800"), StandardCharsets.UTF_8, 2)) { + final Charset charset = StandardCharsets.UTF_8; + try (InputStream in = new ReaderInputStream(new StringReader("\uD800"), charset, (int) ReaderInputStream.minBufferSize(charset.newEncoder()))) { in.read(); } } @@ -85,8 +88,9 @@ public class ReaderInputStreamTest { @Test @Timeout(value = 500, unit = TimeUnit.MILLISECONDS) public void testCodingErrorAction() throws IOException { - CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder().onMalformedInput(CodingErrorAction.REPORT); - try (InputStream in = new ReaderInputStream(new StringReader("\uD800aa"), encoder, 2)) { + final Charset charset = StandardCharsets.UTF_8; + CharsetEncoder encoder = charset.newEncoder().onMalformedInput(CodingErrorAction.REPORT); + try (InputStream in = new ReaderInputStream(new StringReader("\uD800aa"), encoder, (int) ReaderInputStream.minBufferSize(charset.newEncoder()))) { assertThrows(CharacterCodingException.class, in::read); } }