This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-collections.git
commit fe88827643be091df5ed11d89d5f2e2ec4c61892 Author: Alex Herbert <aherb...@apache.org> AuthorDate: Sun Mar 8 13:48:10 2020 +0000 Move the unique filtering of the Hasher indexes to a separate class. --- .../bloomfilter/ArrayCountingBloomFilter.java | 19 +--- .../collections4/bloomfilter/IndexFilters.java | 84 +++++++++++++++++ .../bloomfilter/ArrayCountingBloomFilterTest.java | 63 ++++--------- .../bloomfilter/FixedIndexesTestHasher.java | 67 ++++++++++++++ .../collections4/bloomfilter/IndexFilterTest.java | 103 +++++++++++++++++++++ 5 files changed, 272 insertions(+), 64 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index 9538f32..55e503f 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -17,13 +17,10 @@ package org.apache.commons.collections4.bloomfilter; import java.util.BitSet; -import java.util.HashSet; import java.util.NoSuchElementException; import java.util.PrimitiveIterator; import java.util.PrimitiveIterator.OfInt; -import java.util.function.Consumer; import java.util.function.IntConsumer; -import java.util.Set; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.Shape; @@ -324,7 +321,8 @@ public class ArrayCountingBloomFilter extends AbstractBloomFilter implements Cou */ private void applyAsHasher(final Hasher hasher, final IntConsumer action) { verifyHasher(hasher); - toSet(hasher).forEach(i -> action.accept(i)); + // We do not naturally handle duplicates so filter them. + IndexFilters.distinctIndexes(hasher, getShape(), action); } /** @@ -380,17 +378,4 @@ public class ArrayCountingBloomFilter extends AbstractBloomFilter implements Cou state |= updated; counts[idx] = updated; } - - /** - * Convert the hasher to a set. Duplicates indexes are removed. - * The unique indexes can be used to increment or decrement counts by 1. - * - * @param hasher the hasher - * @return the unique indexes - */ - private Set<Integer> toSet(Hasher hasher) { - final Set<Integer> lst = new HashSet<>(); - hasher.getBits(getShape()).forEachRemaining((Consumer<Integer>) lst::add); - return lst; - } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java new file mode 100644 index 0000000..c9b81ba --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; + +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Consumer; +import java.util.function.IntConsumer; + +/** + * Contains functions to filter indexes. + */ +final class IndexFilters { + /** Do not instantiate. */ + private IndexFilters() { + } + + /** + * Transfer all distinct indexes in the specified {@code hasher} generated for the + * specified {@code shape} to the specified {@code consumer}. For example this + * can be used to merge a {@link Hasher} representation of a Bloom filter into a + * {@link BloomFilter} instance that does not naturally handle duplicate indexes. + * + * <p>This method is functionally equivalent to: + * + * <pre> + * final Set<Integer> distinct = new TreeSet<>(); + * hasher.getBits(shape).forEachRemaining((Consumer<Integer>) i -> { + * if (distinct.add(i)) { + * consumer.accept(i); + * } + * }); + * </pre> + * + * @param hasher the hasher + * @param shape the shape + * @param consumer the consumer to receive distinct indexes + * @throws NullPointerException if the hasher, shape or action are null + * @see Hasher#getBits(Shape) + */ + static void distinctIndexes(Hasher hasher, Shape shape, IntConsumer consumer) { + Objects.requireNonNull(hasher, "hasher"); + Objects.requireNonNull(shape, "shape"); + Objects.requireNonNull(consumer, "consumer"); + + // TODO + // This function can be optimised based on the expected size + // (number of indexes) of the hasher and the number of bits in the shape. + // + // A large size would benefit from a pre-allocated BitSet-type filter. + // A very small size may be more efficient as a simple array of values + // that have already been seen that is scanned for each new index. + // + // A default is to use a Set to filter distinct values. The choice of set + // should be evaluated. A HashSet would be optimal if size is known. + // A TreeSet has lower memory consumption and performance is not as + // sensitive to knowing the size in advance. + + final Set<Integer> distinct = new TreeSet<>(); + hasher.getBits(shape).forEachRemaining((Consumer<Integer>) i -> { + if (distinct.add(i)) { + consumer.accept(i); + } + }); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java index 0de90c8..1940287 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -23,14 +23,12 @@ import static org.junit.Assert.assertTrue; import java.util.Arrays; import java.util.HashMap; import java.util.Map; -import java.util.PrimitiveIterator.OfInt; import java.util.concurrent.ThreadLocalRandom; import java.util.function.BiConsumer; import java.util.function.BiPredicate; import java.util.function.Function; import java.util.function.ToIntBiFunction; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.Shape; import org.junit.Test; @@ -40,35 +38,6 @@ import org.junit.Test; */ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { - /** - * A simple Hasher implementation to return indexes. Duplicates are allowed. - */ - private static class SimpleHasher implements Hasher { - private Shape shape; - private int[] indexes; - - SimpleHasher(Shape shape, int... indexes) { - this.shape = shape; - this.indexes = indexes; - } - - @Override - public OfInt getBits(Shape shape) { - // Assume shape is the same - return Arrays.stream(indexes).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } - - @Override - public boolean isEmpty() { - return indexes.length == 0; - } - } - @Override protected ArrayCountingBloomFilter createEmptyFilter(final Shape shape) { return new ArrayCountingBloomFilter(shape); @@ -124,7 +93,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { public void constructorTest_Hasher_Duplicates() { final int[] expected = {0, 1, 1, 0, 0, 1}; // Some indexes with duplicates - final Hasher hasher = new SimpleHasher(shape, 1, 2, 2, 5); + final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 2, 5); final ArrayCountingBloomFilter bf = createFilter(hasher, shape); final long[] lb = bf.getBits(); @@ -141,10 +110,10 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { @Test public void contains_BloomFilter() { // Some indexes with duplicates - final Hasher hasher = new SimpleHasher(shape, 1, 2, 5); + final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 5); final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - assertFalse(bf.contains(new BitSetBloomFilter(new SimpleHasher(shape, 3, 4), shape))); - assertTrue(bf.contains(new BitSetBloomFilter(new SimpleHasher(shape, 2, 5), shape))); + assertFalse(bf.contains(new BitSetBloomFilter(new FixedIndexesTestHasher(shape, 3, 4), shape))); + assertTrue(bf.contains(new BitSetBloomFilter(new FixedIndexesTestHasher(shape, 2, 5), shape))); } /** @@ -153,7 +122,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void mergeTest_Counts_CountingBloomFilter() { - assertMerge(counts -> createFilter(new SimpleHasher(shape, counts), shape), + assertMerge(counts -> createFilter(new FixedIndexesTestHasher(shape, counts), shape), BloomFilter::merge); } @@ -162,7 +131,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void mergeTest_Counts_BloomFilter() { - assertMerge(counts -> new BitSetBloomFilter(new SimpleHasher(shape, counts), shape), + assertMerge(counts -> new BitSetBloomFilter(new FixedIndexesTestHasher(shape, counts), shape), BloomFilter::merge); } @@ -171,7 +140,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void mergeTest_Counts_Hasher() { - assertMerge(counts -> new SimpleHasher(shape, counts), + assertMerge(counts -> new FixedIndexesTestHasher(shape, counts), BloomFilter::merge); } @@ -180,7 +149,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void mergeTest_Counts_Hasher_Duplicates() { - assertMerge(counts -> new SimpleHasher(shape, createDuplicates(counts)), + assertMerge(counts -> new FixedIndexesTestHasher(shape, createDuplicates(counts)), BloomFilter::merge); } @@ -190,7 +159,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void removeTest_Counts_CountingBloomFilter() { - assertRemove(counts -> createFilter(new SimpleHasher(shape, counts), shape), + assertRemove(counts -> createFilter(new FixedIndexesTestHasher(shape, counts), shape), CountingBloomFilter::remove); } @@ -199,7 +168,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void removeTest_Counts_BloomFilter() { - assertRemove(counts -> new BitSetBloomFilter(new SimpleHasher(shape, counts), shape), + assertRemove(counts -> new BitSetBloomFilter(new FixedIndexesTestHasher(shape, counts), shape), CountingBloomFilter::remove); } @@ -208,7 +177,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void removeTest_Counts_Hasher() { - assertRemove(counts -> new SimpleHasher(shape, counts), + assertRemove(counts -> new FixedIndexesTestHasher(shape, counts), CountingBloomFilter::remove); } @@ -217,7 +186,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void removeTest_Counts_Hasher_Duplicates() { - assertRemove(counts -> new SimpleHasher(shape, createDuplicates(counts)), + assertRemove(counts -> new FixedIndexesTestHasher(shape, createDuplicates(counts)), CountingBloomFilter::remove); } @@ -289,7 +258,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { Function<int[], F> converter, BiConsumer<ArrayCountingBloomFilter, F> operation, int[] expected) { - final Hasher hasher = new SimpleHasher(shape, indexes1); + final Hasher hasher = new FixedIndexesTestHasher(shape, indexes1); final ArrayCountingBloomFilter bf = createFilter(hasher, shape); final F filter = converter.apply(indexes2); operation.accept(bf, filter); @@ -301,7 +270,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void mergeTest_Overflow() { - final Hasher hasher = new SimpleHasher(shape, 1, 2, 3); + final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); final ArrayCountingBloomFilter bf = createFilter(hasher, shape); ArrayCountingBloomFilter bf2 = createFromCounts(new int[] {0, 0, Integer.MAX_VALUE}); @@ -328,10 +297,10 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { */ @Test public void removeTest_Negative() { - final Hasher hasher = new SimpleHasher(shape, 1, 2, 3); + final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); ArrayCountingBloomFilter bf = createFilter(hasher, shape); - final Hasher hasher2 = new SimpleHasher(shape, 2); + final Hasher hasher2 = new FixedIndexesTestHasher(shape, 2); ArrayCountingBloomFilter bf2 = createFilter(hasher2, shape); // More - Less = OK diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java b/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java new file mode 100644 index 0000000..4477a93 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; + +import java.util.Arrays; +import java.util.PrimitiveIterator.OfInt; + +/** + * A Hasher implementation to return fixed indexes. Duplicates are allowed. + * The shape is ignored when generating the indexes. + * + * <p><strong>This is not a real hasher and is used for testing only</strong>. + */ +class FixedIndexesTestHasher implements Hasher { + /** The shape. */ + private Shape shape; + /** The indexes. */ + private int[] indexes; + + /** + * Create an instance. + * + * @param shape the shape + * @param indexes the indexes + */ + FixedIndexesTestHasher(Shape shape, int... indexes) { + this.shape = shape; + this.indexes = indexes; + } + + @Override + public OfInt getBits(Shape shape) { + if (!this.shape.equals(shape)) { + throw new IllegalArgumentException( + String.format("shape (%s) does not match internal shape (%s)", shape, this.shape)); + } + return Arrays.stream(indexes).iterator(); + } + + @Override + public HashFunctionIdentity getHashFunctionIdentity() { + return shape.getHashFunctionIdentity(); + } + + @Override + public boolean isEmpty() { + return indexes.length == 0; + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java new file mode 100644 index 0000000..778960d --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentityImpl; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Set; +import java.util.function.IntConsumer; +import java.util.stream.Collectors; + +/** + * Tests for the {@link IndexFilters}. + */ +public class IndexFilterTest { + + /** + * The shape of the dummy Bloom filter. + * This is used as an argument to a Hasher that just returns fixed indexes + * so the parameters do not matter. + */ + private Shape shape = new Shape(new HashFunctionIdentityImpl( + "Apache Commons Collections", "Dummy", Signedness.SIGNED, ProcessType.CYCLIC, 0L), + 50, 3000, 4); + + @Test + public void testApplyThrowsWithNullArguments() { + final FixedIndexesTestHasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); + final Shape shape = this.shape; + final ArrayList<Integer> actual = new ArrayList<>(); + final IntConsumer consumer = actual::add; + + try { + IndexFilters.distinctIndexes(null, shape, consumer); + Assert.fail("null hasher"); + } catch (NullPointerException expected) { + // Ignore + } + + try { + IndexFilters.distinctIndexes(hasher, null, consumer); + Assert.fail("null shape"); + } catch (NullPointerException expected) { + // Ignore + } + + try { + IndexFilters.distinctIndexes(hasher, shape, null); + Assert.fail("null consumer"); + } catch (NullPointerException expected) { + // Ignore + } + + // All OK together + IndexFilters.distinctIndexes(hasher, shape, consumer); + } + + @Test + public void testApply() { + assertFilter(1, 4, 6, 7, 9); + } + + @Test + public void testApplyWithDuplicates() { + assertFilter(1, 4, 4, 6, 7, 7, 7, 7, 7, 9); + } + + private void assertFilter(int... indexes) { + final FixedIndexesTestHasher hasher = new FixedIndexesTestHasher(shape, indexes); + final Set<Integer> expected = Arrays.stream(indexes).boxed().collect(Collectors.toSet()); + final ArrayList<Integer> actual = new ArrayList<>(); + + IndexFilters.distinctIndexes(hasher, shape, actual::add); + + Assert.assertEquals(expected.size(), actual.size()); + // Check the array has all the values. + // We do not currently check the order of indexes from the + // hasher.getBits() function. + for (Integer index : actual) { + Assert.assertTrue(expected.contains(index)); + } + } +}