add lazy matching to LZ77 compressors
Project: http://git-wip-us.apache.org/repos/asf/commons-compress/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-compress/commit/f7e0a4a0 Tree: http://git-wip-us.apache.org/repos/asf/commons-compress/tree/f7e0a4a0 Diff: http://git-wip-us.apache.org/repos/asf/commons-compress/diff/f7e0a4a0 Branch: refs/heads/master Commit: f7e0a4a0212424edd1765d1abe66772af069dafb Parents: 86e2bd0 Author: Stefan Bodewig <bode...@apache.org> Authored: Sun Apr 2 13:09:54 2017 +0200 Committer: Stefan Bodewig <bode...@apache.org> Committed: Sun Apr 2 13:09:54 2017 +0200 ---------------------------------------------------------------------- .../compressors/lz77support/LZ77Compressor.java | 32 +++++++++++ .../compressors/lz77support/Parameters.java | 58 ++++++++++++++++++-- 2 files changed, 85 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-compress/blob/f7e0a4a0/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java index c284f4c..5379259 100644 --- a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java +++ b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java @@ -394,6 +394,8 @@ public class LZ77Compressor { private void compress() throws IOException { final int minMatch = params.getMinBackReferenceLength(); + final boolean lazy = params.getLazyMatching(); + final int lazyThreshold = params.getLazyMatchingThreshold(); while (lookahead >= minMatch) { catchUpMissedInserts(); @@ -402,6 +404,11 @@ public class LZ77Compressor { if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) { // sets matchStart as a side effect matchLength = longestMatch(hashHead); + + if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) { + // try to find a longer match using the next position + matchLength = longestMatchForNextPosition(matchLength); + } } if (matchLength >= minMatch) { if (blockStart != currentPosition) { @@ -441,6 +448,31 @@ public class LZ77Compressor { return hashHead; } + private int longestMatchForNextPosition(final int prevMatchLength) { + // save a bunch of values to restore them if the next match isn't better than the current one + final int prevMatchStart = matchStart; + final int prevInsertHash = insertHash; + + lookahead--; + currentPosition++; + int hashHead = insertString(currentPosition); + final int prevHashHead = prev[currentPosition & wMask]; + int matchLength = longestMatch(hashHead); + + if (matchLength <= prevMatchLength) { + // use the first match, as the next one isn't any better + matchLength = prevMatchLength; + matchStart = prevMatchStart; + + // restore modified values + head[insertHash] = prevHashHead; + insertHash = prevInsertHash; + currentPosition--; + lookahead++; + } + return matchLength; + } + private void insertStringsInMatch(int matchLength) { // inserts strings contained in current match // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts http://git-wip-us.apache.org/repos/asf/commons-compress/blob/f7e0a4a0/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java b/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java index e4e9e47..5cdbae9 100644 --- a/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java +++ b/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java @@ -52,7 +52,8 @@ public final class Parameters { public static class Builder { private final int windowSize; private int minBackReferenceLength, maxBackReferenceLength, maxOffset, maxLiteralLength; - private Integer niceBackReferenceLength, maxCandidates; + private Integer niceBackReferenceLength, maxCandidates, lazyThreshold; + private Boolean lazyMatches; private Builder(int windowSize) { if (windowSize < 2 || !isPowerOfTwo(windowSize)) { @@ -173,6 +174,30 @@ public final class Parameters { } /** + * Sets whether lazy matching should be performed. + * + * <p>Lazy matching means that after a back-reference for a certain position has been found the compressor will + * try to find a longer match for the next position.</p> + * + * <p>Lazy matching is enabled by default and disabled when tuning for speed.</p> + */ + public Builder withLazyMatching(boolean lazy) { + lazyMatches = lazy; + return this; + } + + /** + * Sets the threshold for lazy matching. + * + * <p>Even if lazy matching is enabled it will not be performed if the length of the back-reference found for + * the current position is longer than this value.</p> + */ + public Builder withLazyThreshold(int threshold) { + lazyThreshold = threshold; + return this; + } + + /** * Changes the default setting for "nice back-reference length" and "maximum number of candidates" for improved * compression speed at the cost of compression ratio. * @@ -181,6 +206,8 @@ public final class Parameters { public Builder tunedForSpeed() { niceBackReferenceLength = Math.max(minBackReferenceLength, maxBackReferenceLength / 8); maxCandidates = Math.max(32, windowSize / 1024); + lazyMatches = false; + lazyThreshold = minBackReferenceLength; return this; } @@ -191,8 +218,9 @@ public final class Parameters { * <p>Use this method after configuring "maximum back-reference length".</p> */ public Builder tunedForCompressionRatio() { - niceBackReferenceLength = maxBackReferenceLength; + niceBackReferenceLength = lazyThreshold = maxBackReferenceLength; maxCandidates = Math.max(32, windowSize / 16); + lazyMatches = true; return this; } @@ -205,17 +233,21 @@ public final class Parameters { int niceLen = niceBackReferenceLength != null ? niceBackReferenceLength : Math.max(minBackReferenceLength, maxBackReferenceLength / 2); int candidates = maxCandidates != null ? maxCandidates : Math.max(256, windowSize / 128); + boolean lazy = lazyMatches != null ? lazyMatches : true; + int threshold = lazy ? (lazyThreshold != null ? lazyThreshold : niceLen) : minBackReferenceLength; return new Parameters(windowSize, minBackReferenceLength, maxBackReferenceLength, - maxOffset, maxLiteralLength, niceLen, candidates); + maxOffset, maxLiteralLength, niceLen, candidates, lazy, threshold); } } private final int windowSize, minBackReferenceLength, maxBackReferenceLength, maxOffset, maxLiteralLength, - niceBackReferenceLength, maxCandidates; + niceBackReferenceLength, maxCandidates, lazyThreshold; + private final boolean lazyMatching; private Parameters(int windowSize, int minBackReferenceLength, int maxBackReferenceLength, int maxOffset, - int maxLiteralLength, int niceBackReferenceLength, int maxCandidates) { + int maxLiteralLength, int niceBackReferenceLength, int maxCandidates, boolean lazyMatching, + int lazyThreshold) { this.windowSize = windowSize; this.minBackReferenceLength = minBackReferenceLength; this.maxBackReferenceLength = maxBackReferenceLength; @@ -223,6 +255,8 @@ public final class Parameters { this.maxLiteralLength = maxLiteralLength; this.niceBackReferenceLength = niceBackReferenceLength; this.maxCandidates = maxCandidates; + this.lazyMatching = lazyMatching; + this.lazyThreshold = lazyThreshold; } /** @@ -276,6 +310,20 @@ public final class Parameters { return maxCandidates; } + /** + * Gets whether to perform lazy matching. + */ + public boolean getLazyMatching() { + return lazyMatching; + } + + /** + * Gets the threshold for lazy matching. + */ + public int getLazyMatchingThreshold() { + return lazyThreshold; + } + private static final boolean isPowerOfTwo(int x) { // pre-condition: x > 0 return (x & (x - 1)) == 0;