Author: bodewig
Date: Sun May 20 18:04:36 2012
New Revision: 1340786
URL: http://svn.apache.org/viewvc?rev=1340786&view=rev
Log:
Integrate fallback sort into the rest, add some more tests and fix bug in
bucket boundary calculation
Modified:
commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java
commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java
Modified:
commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java
URL:
http://svn.apache.org/viewvc/commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java?rev=1340786&r1=1340785&r2=1340786&view=diff
==============================================================================
---
commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java
(original)
+++
commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java
Sun May 20 18:04:36 2012
@@ -34,6 +34,7 @@ import java.util.BitSet;
* Compress" you'd get:</p>
*
* <pre>
+ * CompressCommons
* Commons Compress
* CompressCommons
* essCommons Compr
@@ -51,9 +52,9 @@ import java.util.BitSet;
* ssCommons Compre
* </pre>
*
- * <p>Which results in a new text "s romooCCmmpnse", in adition the
+ * <p>Which results in a new text "ss romooCCmmpnse", in adition the
* index of the first line that contained the original text is kept -
- * in this case it is 0. The idea is that in a long English text all
+ * in this case it is 1. The idea is that in a long English text all
* permutations that start with "he" are likely suffixes of a "the" and
* thus they end in "t" leading to a larger block of "t"s that can
* better be compressed by the subsequent Move-to-Front, run-length
@@ -101,7 +102,8 @@ class BlockSort {
* been dropped after 0.9.0 and replaced by a fallback sorting
* algorithm.
*
- * I've added the fallbackSort function of 1.0.6.
+ * I've added the fallbackSort function of 1.0.6 and tried to
+ * integrate it with the existing code without touching too much.
*/
/*
@@ -154,13 +156,16 @@ class BlockSort {
this.workDone = 0;
this.blockRandomised = false;
this.firstAttempt = true;
+
+ if (last + 1 < 10000) {
+ fallbackSort(data, last);
+ } else {
+
mainSort(data, last);
if (this.firstAttempt && (this.workDone > this.workLimit)) {
- randomiseBlock(data, last);
- this.workLimit = this.workDone = 0;
- this.firstAttempt = false;
- mainSort(data, last);
+ fallbackSort(data, last);
+ }
}
final int[] fmap = data.fmap;
@@ -173,7 +178,27 @@ class BlockSort {
}
// assert (data.origPtr != -1) : data.origPtr;
- return blockRandomised;
+ return false;
+ }
+
+ /**
+ * Adapt fallbackSort to the expected interface of the rest of the
+ * code, in particular deal with the fact that block starts at
+ * offset 1 (in libbzip2 1.0.6 it starts at 0).
+ */
+ final void fallbackSort(final BZip2CompressorOutputStream.Data data,
+ final int last) {
+ data.block[0] = data.block[last + 1];
+ fallbackSort(data.fmap, data.block, last + 1);
+ for (int i = 0; i < last + 1; i++) {
+ --data.fmap[i];
+ }
+ for (int i = 0; i < last + 1; i++) {
+ if (data.fmap[i] == -1) {
+ data.fmap[i] = last;
+ break;
+ }
+ }
}
/*---------------------------------------------*/
@@ -415,12 +440,13 @@ class BlockSort {
}
/*
- * The C code uses an array of ints to represents the bucket-start
- * flags (bhtab). It also contains optimizations to skip over 32
- * consecutively set or consecutively unset bits on word
- * boundaries at once. For now I've chosen to use the simpler but
- * potentially slower code using BitSet - also in the hope that
- * using the BitSet#nextXXX methods may be fast enough.
+ * The C code uses an array of ints (each int holding 32 flags) to
+ * represents the bucket-start flags (bhtab). It also contains
+ * optimizations to skip over 32 consecutively set or
+ * consecutively unset bits on word boundaries at once. For now
+ * I've chosen to use the simpler but potentially slower code
+ * using BitSet - also in the hope that using the BitSet#nextXXX
+ * methods may be fast enough.
*/
/**
@@ -432,16 +458,22 @@ class BlockSort {
* @param off offset of first byte to sort in block
*/
final void fallbackSort(int[] fmap, byte[] block, int nblock) {
- int[] ftab = new int[257];
+ final int[] ftab = new int[257];
int H, i, j, k, l, r, cc, cc1;
int nNotDone;
int nBhtab;
+ final int[] eclass = getEclass();
+ for (i = 0; i < nblock; i++) {
+ eclass[i] = 0;
+ }
/*--
LBZ2: Initial 1-char radix sort to generate
initial fmap and initial BH bits.
--*/
- for (i = 0; i < nblock; i++) ftab[block[i] & 0xff]++;
+ for (i = 0; i < nblock; i++) {
+ ftab[block[i] & 0xff]++;
+ }
for (i = 1; i < 257; i++) ftab[i] += ftab[i - 1];
for (i = 0; i < nblock; i++) {
@@ -467,8 +499,6 @@ class BlockSort {
bhtab.clear(nblock + 2 * i + 1);
}
- eclass = getEclass();
-
/*-- LBZ2: the log(N) loop --*/
H = 1;
while (true) {
@@ -491,10 +521,10 @@ class BlockSort {
/*-- LBZ2: find the next non-singleton bucket --*/
k = r + 1;
- k = bhtab.nextSetBit(k);
+ k = bhtab.nextClearBit(k);
l = k - 1;
if (l >= nblock) break;
- k = bhtab.nextClearBit(k);
+ k = bhtab.nextSetBit(k + 1);
r = k - 1;
if (r >= nblock) break;
@@ -862,8 +892,8 @@ class BlockSort {
private static final int SETMASK = (1 << 21);
private static final int CLEARMASK = (~SETMASK);
- private void mainSort(final BZip2CompressorOutputStream.Data dataShadow,
- final int lastShadow) {
+ final void mainSort(final BZip2CompressorOutputStream.Data dataShadow,
+ final int lastShadow) {
final int[] runningOrder = this.mainSort_runningOrder;
final int[] copy = this.mainSort_copy;
final boolean[] bigDone = this.mainSort_bigDone;
Modified:
commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java?rev=1340786&r1=1340785&r2=1340786&view=diff
==============================================================================
---
commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java
(original)
+++
commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java
Sun May 20 18:04:36 2012
@@ -70,17 +70,56 @@ public class BlockSortTest {
0, 1, 7, 6, 8, 2, 3, 5, 4
};
+ private static final byte[] FIXTURE2 = {
+ 'C', 'o', 'm', 'm', 'o', 'n', 's', ' ', 'C', 'o', 'm', 'p', 'r', 'e',
's', 's',
+ };
+
+ private static final byte[] FIXTURE2_BWT = {
+ 's', 's', ' ', 'r', 'o', 'm', 'o', 'o', 'C', 'C', 'm', 'm', 'p', 'n',
's', 'e',
+ };
+
@Test
public void testSortFixture() {
- BZip2CompressorOutputStream.Data data = new
BZip2CompressorOutputStream.Data(1);
- System.arraycopy(FIXTURE, 0, data.block, 1, FIXTURE.length);
- BlockSort s = new BlockSort(data);
- assertFalse(s.blockSort(data, FIXTURE.length - 1));
- assertEquals(FIXTURE[FIXTURE.length - 1], data.block[0]);
- for (int i = 0; i < FIXTURE.length; i++) {
- assertEquals(FIXTURE_BWT[i], data.block[data.fmap[i]]);
- }
- assertEquals(0, data.origPtr);
+ DS ds = setUpFixture();
+ assertFalse(ds.s.blockSort(ds.data, FIXTURE.length - 1));
+ assertFixtureSorted(ds.data);
+ assertEquals(0, ds.data.origPtr);
+ }
+
+ @Test
+ public void testSortFixtureMainSort() {
+ DS ds = setUpFixture();
+ ds.s.mainSort(ds.data, FIXTURE.length - 1);
+ assertFixtureSorted(ds.data);
+ }
+
+ @Test
+ public void testSortFixtureFallbackSort() {
+ DS ds = setUpFixture();
+ ds.s.fallbackSort(ds.data, FIXTURE.length - 1);
+ assertFixtureSorted(ds.data);
+ }
+
+ @Test
+ public void testSortFixture2() {
+ DS ds = setUpFixture2();
+ assertFalse(ds.s.blockSort(ds.data, FIXTURE2.length - 1));
+ assertFixture2Sorted(ds.data);
+ assertEquals(1, ds.data.origPtr);
+ }
+
+ @Test
+ public void testSortFixture2MainSort() {
+ DS ds = setUpFixture2();
+ ds.s.mainSort(ds.data, FIXTURE2.length - 1);
+ assertFixture2Sorted(ds.data);
+ }
+
+ @Test
+ public void testSortFixture2FallbackSort() {
+ DS ds = setUpFixture2();
+ ds.s.fallbackSort(ds.data, FIXTURE2.length - 1);
+ assertFixture2Sorted(ds.data);
}
@Test
@@ -91,4 +130,43 @@ public class BlockSortTest {
s.fallbackSort(fmap, FIXTURE, FIXTURE.length);
assertArrayEquals(FIXTURE_SORTED, fmap);
}
+
+ private DS setUpFixture() {
+ return setUpFixture(FIXTURE);
+ }
+
+ private void assertFixtureSorted(BZip2CompressorOutputStream.Data data) {
+ assertFixtureSorted(data, FIXTURE, FIXTURE_BWT);
+ }
+
+ private DS setUpFixture2() {
+ return setUpFixture(FIXTURE2);
+ }
+
+ private void assertFixture2Sorted(BZip2CompressorOutputStream.Data data) {
+ assertFixtureSorted(data, FIXTURE2, FIXTURE2_BWT);
+ }
+
+ private DS setUpFixture(byte[] fixture) {
+ BZip2CompressorOutputStream.Data data = new
BZip2CompressorOutputStream.Data(1);
+ System.arraycopy(fixture, 0, data.block, 1, fixture.length);
+ return new DS(data, new BlockSort(data));
+ }
+
+ private void assertFixtureSorted(BZip2CompressorOutputStream.Data data,
+ byte[] fixture, byte[] fixtureBwt) {
+ assertEquals(fixture[fixture.length - 1], data.block[0]);
+ for (int i = 0; i < fixture.length; i++) {
+ assertEquals(fixtureBwt[i], data.block[data.fmap[i]]);
+ }
+ }
+
+ private static class DS {
+ private final BZip2CompressorOutputStream.Data data;
+ private final BlockSort s;
+ DS(BZip2CompressorOutputStream.Data data, BlockSort s) {
+ this.data = data;
+ this.s = s;
+ }
+ }
}
\ No newline at end of file