This is an automated email from the ASF dual-hosted git repository.
desruisseaux pushed a commit to branch geoapi-4.0
in repository https://gitbox.apache.org/repos/asf/sis.git
The following commit(s) were added to refs/heads/geoapi-4.0 by this push:
new 04d4eee7a7 Documentation, renaming or minor reorganization. No
significant code change in this commit.
04d4eee7a7 is described below
commit 04d4eee7a7af7c4eb206907636a7b35a02cdfbc0
Author: Martin Desruisseaux <[email protected]>
AuthorDate: Thu Jul 7 18:04:02 2022 +0200
Documentation, renaming or minor reorganization.
No significant code change in this commit.
---
.../apache/sis/internal/storage/inflater/LZW.java | 133 ++++++++++++++-------
1 file changed, 90 insertions(+), 43 deletions(-)
diff --git
a/storage/sis-geotiff/src/main/java/org/apache/sis/internal/storage/inflater/LZW.java
b/storage/sis-geotiff/src/main/java/org/apache/sis/internal/storage/inflater/LZW.java
index a2cd2d4d1b..bdff8a2024 100644
---
a/storage/sis-geotiff/src/main/java/org/apache/sis/internal/storage/inflater/LZW.java
+++
b/storage/sis-geotiff/src/main/java/org/apache/sis/internal/storage/inflater/LZW.java
@@ -54,6 +54,14 @@ final class LZW extends CompressionChannel {
*/
private static final int FIRST_ADAPTATIVE_CODE = 258;
+ /**
+ * For computing value of {@link #indexOfFreeEntry} when {@link #codeSize}
needs to be incremented.
+ * TIFF specification said that the size needs to be incremented after
codes 510, 1022 and 2046 are
+ * added to the {@link #entriesForCodes} table. Those values are a little
bit lower than what we
+ * would expect if the full integer ranges were used.
+ */
+ private static final int OFFSET_TO_MAXIMUM = 1;
+
/**
* Initial number of bits in a code. TIFF specification said that the size
needs to be
* incremented after codes 510, 1022 and 2046 are added to the {@link
#entriesForCodes} table.
@@ -72,37 +80,59 @@ final class LZW extends CompressionChannel {
private int codeSize;
/**
- * Size of the offset stored in {@link #entriesForCodes} array. We take
{@value #MAX_CODE_SIZE} bits for the
- * length and use the remaining for the offset. The rational is that even
in the worst case scenario where
- * the same byte is always appended to the sequence, the maximal length
can not exceeded the dictionary size
- * because a {@link #CLEAR_CODE} will be emitted when the dictionary is
full.
+ * Position of the lowest bit in an {@link #entriesForCodes} element where
the length is stored.
+ * The length is extracted from an element by {@code element >>>
LOWEST_LENGTH_BIT}.
+ * The offset is chosen for allowing {@value #MAX_CODE_SIZE} bits for
storing the length.
+ *
+ * <div class="note"><b>Rational:</b>
+ * even in the worst case scenario where the same byte is always appended
to the sequence,
+ * the maximal length can not exceeded the dictionary size because a
{@link #CLEAR_CODE}
+ * will be emitted when the dictionary is full.</div>
*/
- private static final int OFFSET_SIZE = Integer.SIZE - MAX_CODE_SIZE;
+ private static final int LOWEST_LENGTH_BIT = Integer.SIZE - MAX_CODE_SIZE;
/**
- * The mask to apply on a {@link #entriesForCodes} value for getting the
offset.
+ * Extracts the number of bytes of an entry stored in the {@link
#stringsFromCode} array.
+ *
+ * @param element an element of the {@link #entriesForCodes} array.
+ * @return number of consecutive bytes to read in {@link #stringsFromCode}
array.
*/
- private static final int OFFSET_MASK = (1 << OFFSET_SIZE) - 1;
+ private static int length(final int element) {
+ return element >>> LOWEST_LENGTH_BIT;
+ }
/**
- * Sequences of bytes associated to codes. For a given code <var>c</var>,
- * the first byte is {@code entriesForCodes[entriesForCodes[c] &
OFFSET_MASK]}
- * and the length is {@code entriesForCodes[c] >>> OFFSET_SIZE}.
+ * The mask to apply on an {@link #entriesForCodes} element for getting
the offset.
+ * The actual offset is {@code (element & OFFSET_MASK)}.
*/
- private byte[] stringsFromCode;
+ private static final int OFFSET_MASK = (1 << LOWEST_LENGTH_BIT) - 1;
/**
- * Pointers to byte sequences for a code in the {@link #entriesForCodes}
array.
- * The lowest {@value #OFFSET_SIZE} bits are offsets to first byte and the
highest
- * {@value #MAX_CODE_SIZE} bits are lengths.
+ * Extracts the index of the first byte of an entry stored in the {@link
#stringsFromCode} array.
+ *
+ * @param element an element of the {@link #entriesForCodes} array.
+ * @return index of the first byte to read in {@link #stringsFromCode}
array.
*/
- private final int[] entriesForCodes;
+ private static int offset(final int element) {
+ return element & OFFSET_MASK;
+ }
/**
- * If some bytes could not be written in previous {@code read(…)}
execution because the target buffer was full,
- * offset and length of those bytes. Otherwise 0. This is a value from
{@link #entriesForCodes} array.
+ * Encodes an offset together with its length.
*/
- private int pendingEntry;
+ private static int offsetAndLength(final int offset, final int length) {
+ final int element = offset | (length << LOWEST_LENGTH_BIT);
+ assert offset(element) == offset : offset;
+ assert length(element) == length : length;
+ return element;
+ }
+
+ /**
+ * Pointers to byte sequences for a code in the {@link #entriesForCodes}
array.
+ * Each element is a value encoded by {@link #offsetAndLength(int, int)}
method.
+ * Elements are decoded by {@link #offset(int)} {@link #length(int)}
methods.
+ */
+ private final int[] entriesForCodes;
/**
* Offset and length of the sequence of bytes associated to the code in
previous iteration.
@@ -110,6 +140,12 @@ final class LZW extends CompressionChannel {
*/
private int previousEntry;
+ /**
+ * If some bytes could not be written in previous {@code read(…)}
execution because the target buffer was full,
+ * offset and length of those bytes. Otherwise 0. This is a value from
{@link #entriesForCodes} array.
+ */
+ private int pendingEntry;
+
/**
* Index of the next entry available in {@link #entriesForCodes}.
* Shall not be lower than {@value #FIRST_ADAPTATIVE_CODE}.
@@ -122,6 +158,13 @@ final class LZW extends CompressionChannel {
*/
private int indexOfFreeString;
+ /**
+ * Sequences of bytes associated to codes. For a given <var>c</var> code
read from the stream,
+ * the first uncompressed byte is {@code
stringsFromCode(offset(entriesForCodes[c]))} and the
+ * number of bytes is {@code length(entriesForCodes[c])}.
+ */
+ private byte[] stringsFromCode;
+
/**
* Whether the {@link #EOI_CODE} has been found.
*/
@@ -136,13 +179,13 @@ final class LZW extends CompressionChannel {
*/
public LZW(final ChannelDataInput input) {
super(input);
- entriesForCodes = new int [(1 << MAX_CODE_SIZE) - 1];
- stringsFromCode = new byte[4 << MAX_CODE_SIZE]; // Dynamically
expanded if needed.
- for (int i=0; i <= (1 << Byte.SIZE); i++) {
- entriesForCodes[i] = i | (1 << OFFSET_SIZE);
+ indexOfFreeEntry = FIRST_ADAPTATIVE_CODE;
+ entriesForCodes = new int [(1 << MAX_CODE_SIZE) - OFFSET_TO_MAXIMUM];
+ stringsFromCode = new byte[4 << MAX_CODE_SIZE]; // Dynamically
expanded if needed.
+ for (int i=0; i < (1 << Byte.SIZE); i++) {
+ entriesForCodes[i] = i | (1 << LOWEST_LENGTH_BIT);
stringsFromCode[i] = (byte) i;
}
- indexOfFreeEntry = FIRST_ADAPTATIVE_CODE;
}
/**
@@ -171,15 +214,6 @@ final class LZW extends CompressionChannel {
codeSize = MIN_CODE_SIZE;
}
- /**
- * Encodes an offset together with its length.
- */
- private static int offsetAndLength(final int offset, final int length) {
- assert length >= 0 && length < (1 << MAX_CODE_SIZE) : length;
- assert offset >= 0 && offset < (1 << OFFSET_SIZE) : offset;
- return (length << OFFSET_SIZE) | offset;
- }
-
/**
* Decompresses some bytes from the {@linkplain #input input} into the
given destination buffer.
*
@@ -197,7 +231,7 @@ final class LZW extends CompressionChannel {
if (pendingEntry != 0) {
final int remaining = target.remaining();
final int stringOffset = pendingEntry & OFFSET_MASK;
- final int stringLength = pendingEntry >>> OFFSET_SIZE;
+ final int stringLength = pendingEntry >>> LOWEST_LENGTH_BIT;
target.put(stringsFromCode, stringOffset, Math.min(stringLength,
remaining));
if (stringLength <= remaining) {
pendingEntry = 0;
@@ -217,14 +251,14 @@ final class LZW extends CompressionChannel {
* The body is a little bit more complex because codes in [0 … 255]
range are
* handled as a special case instead of stored in the
`entriesForCodes` table.
*/
- int stringOffset = previousEntry & OFFSET_MASK;
- int stringLength = previousEntry >>> OFFSET_SIZE;
- int maximumIndex = (1 << codeSize) - 1;
+ int stringOffset = offset(previousEntry);
+ int stringLength = length(previousEntry);
+ int maximumIndex = (1 << codeSize) - OFFSET_TO_MAXIMUM;
int code;
while ((code = (int) input.readBits(codeSize)) != EOI_CODE) { //
GetNextCode()
if (code == CLEAR_CODE) {
clearTable(); // InitializeTable()
- maximumIndex = (1 << MIN_CODE_SIZE) - 1;
+ maximumIndex = (1 << MIN_CODE_SIZE) - OFFSET_TO_MAXIMUM;
/*
* We should not have consecutive clear codes, but it is easy
to check for safety.
* The first valid code after `CLEAR_CODE` shall be a byte. If
we reached the end
@@ -232,9 +266,9 @@ final class LZW extends CompressionChannel {
*/
do code = finished() ? EOI_CODE : (int)
input.readBits(MIN_CODE_SIZE); // GetNextCode()
while (code == CLEAR_CODE);
- if (code == EOI_CODE) break;
if ((code & ~0xFF) != 0) {
- throw unexpectedData();
+ if (code == EOI_CODE) break;
+ else throw unexpectedData();
}
/*
* The code to add to the table is a single byte in the [0 …
255] range.
@@ -267,7 +301,11 @@ final class LZW extends CompressionChannel {
final int newLength = stringLength + 1;
final int lastNewByte = newOffset + stringLength;
if (lastNewByte >= stringsFromCode.length) {
- stringsFromCode = Arrays.copyOf(stringsFromCode,
stringsFromCode.length * 2);
+ final int capacity = Math.min(indexOfFreeString * 2,
OFFSET_MASK + 1);
+ if (lastNewByte >= capacity) {
+ throw new IOException("Dictionary overflow");
+ }
+ stringsFromCode = Arrays.copyOf(stringsFromCode, capacity);
}
System.arraycopy(stringsFromCode, stringOffset,
stringsFromCode, newOffset, stringLength);
/*
@@ -279,11 +317,19 @@ final class LZW extends CompressionChannel {
*/
final int entryForCode = entriesForCodes[code];
if (entryForCode != 0) {
// if (IsInTable(Code))
- stringOffset = entryForCode & OFFSET_MASK;
// StringFromCode(Code)
- stringLength = entryForCode >>> OFFSET_SIZE;
+ stringOffset = offset(entryForCode);
// StringFromCode(Code)
+ stringLength = length(entryForCode);
} else {
stringOffset = newOffset; // StringFromCode(OldCode)
+ FirstChar(StringFromCode(OldCode)
stringLength = newLength;
+ /*
+ * In well-formed LZW stream, we should have `code ==
indexOfFreeEntry` here.
+ * However some invalid values are found in practice. We
need `code` to refer
+ * to the entry that we add to the dictionary, otherwise
inconsistencies will
+ * happen during the next iteration (when using
`OldCode`). This is implicitly
+ * ensured by not using `code` directly in next iteration,
but reusing instead
+ * `stringOffset` and `stringLength`.
+ */
}
/*
* Add the missing byte in the new entry. That byte is
`FirstChar(StringFromCode(Code | OldCode)))`.
@@ -301,7 +347,7 @@ final class LZW extends CompressionChannel {
}
if (++indexOfFreeEntry == maximumIndex) {
if (codeSize < MAX_CODE_SIZE) {
- maximumIndex = (1 << ++codeSize) - 1;
+ maximumIndex = (1 << ++codeSize) - OFFSET_TO_MAXIMUM;
} else {
/*
* Incrementing the size to 13 bits is an error
because the TIFF specification
@@ -309,6 +355,7 @@ final class LZW extends CompressionChannel {
* immediately after this code. If this is not the
case, we will have an index
* out of bounds exception in the next iteration,
which is caught above.
*/
+ // Opportunistic check if the size allocated in
constructor is right.
assert indexOfFreeEntry == entriesForCodes.length :
indexOfFreeEntry;
}
}