This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-collections.git
commit fb358a5c80a33e2e4ea46bba57a31148bc710f86 Author: Alex Herbert <aherb...@apache.org> AuthorDate: Mon Mar 2 22:25:45 2020 +0000 Added CountingBloomFilter interface and ArrayCountingBloomFilter. --- .../bloomfilter/ArrayCountingBloomFilter.java | 396 +++++++++++++++ .../bloomfilter/CountingBloomFilter.java | 179 +++++++ .../bloomfilter/ArrayCountingBloomFilterTest.java | 551 +++++++++++++++++++++ 3 files changed, 1126 insertions(+) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java new file mode 100644 index 0000000..9538f32 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.BitSet; +import java.util.HashSet; +import java.util.NoSuchElementException; +import java.util.PrimitiveIterator; +import java.util.PrimitiveIterator.OfInt; +import java.util.function.Consumer; +import java.util.function.IntConsumer; +import java.util.Set; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; + +/** + * A counting Bloom filter using an array to track counts for each enabled bit + * index. + * + * <p>Any operation that results in negative counts or integer overflow of counts will + * mark this filter as invalid. This transition is not reversible. The counts for the + * filter immediately prior to the operation that create invalid counts can be recovered. + * See the documentation in {@link #isValid()} for details. + * + * <p>All the operations in the filter assume the counts are currently valid. Behaviour + * of an invalid filter is undefined. It will no longer function identically to a standard + * Bloom filter that is the merge of all the Bloom filters that have been added + * to and not later subtracted from the counting Bloom filter. + * + * <p>The maximum supported number of items that can be stored in the filter is + * limited by the maximum array size combined with the {@link Shape}. For + * example an implementation using a {@link Shape} with a false-positive + * probability of 1e-6 and {@link Integer#MAX_VALUE} bits can reversibly store + * approximately 75 million items using 20 hash functions per item with a memory + * consumption of approximately 8 GB. + * + * @since 4.5 + * @see Shape + */ +public class ArrayCountingBloomFilter extends AbstractBloomFilter implements CountingBloomFilter { + + /** + * The count of each bit index in the filter. + */ + private final int[] counts; + + /** + * The state flag. This is a bitwise OR of the entire history of all updated + * counts. If negative then a negative count or integer overflow has occurred on + * one or more counts in the history of the filter and the state is invalid. + * + * <p>Maintenance of this state flag is branch-free for improved performance. It + * eliminates a conditional check for a negative count during remove/subtract + * operations and a conditional check for integer overflow during merge/add + * operations. + * + * <p>Note: Integer overflow is unlikely in realistic usage scenarios. A count + * that overflows indicates that the number of items in the filter exceeds the + * maximum possible size (number of bits) of any Bloom filter constrained by + * integer indices. At this point the filter is most likely full (all bits are + * non-zero) and thus useless. + * + * <p>Negative counts are a concern if the filter is used incorrectly by + * removing an item that was never added. It is expected that a user of a + * counting Bloom filter will not perform this action as it is a mistake. + * Enabling an explicit recovery path for negative or overflow counts is a major + * performance burden not deemed necessary for the unlikely scenarios when an + * invalid state is created. Maintenance of the state flag is a concession to + * flag improper use that should not have a major performance impact. + */ + private int state; + + /** + * An iterator of all indexes with non-zero counts. + * + * <p>In the event that the filter state is invalid any index with a negative count + * will also be produced by the iterator. + */ + private class IndexIterator implements PrimitiveIterator.OfInt { + /** The next non-zero index (or counts.length). */ + private int next; + + /** + * Create an instance. + */ + IndexIterator() { + advance(); + } + + /** + * Advance to the next non-zero index. + */ + void advance() { + while (next < counts.length && counts[next] == 0) { + next++; + } + } + + @Override + public boolean hasNext() { + return next < counts.length; + } + + @Override + public int nextInt() { + if (hasNext()) { + final int result = next++; + advance(); + return result; + } + // Currently unreachable as the iterator is only used by + // the StaticHasher which iterates correctly. + throw new NoSuchElementException(); + } + } + + /** + * Constructs an empty counting Bloom filter with the specified shape. + * + * @param shape the shape of the filter + */ + public ArrayCountingBloomFilter(final Shape shape) { + super(shape); + counts = new int[shape.getNumberOfBits()]; + } + + /** + * Constructs a counting Bloom filter from a hasher and a shape. + * + * <p>The filter will be equal to the result of merging the hasher with an empty + * filter; specifically duplicate indexes in the hasher are ignored. + * + * @param hasher the hasher to build the filter from + * @param shape the shape of the filter + * @throws IllegalArgumentException if the hasher cannot generate indices for + * the shape + * @see #merge(Hasher) + */ + public ArrayCountingBloomFilter(final Hasher hasher, final Shape shape) { + super(shape); + // Given the filter is empty we can optimise the operation of merge(hasher) + verifyHasher(hasher); + // Delay array allocation until after hasher is verified + counts = new int[shape.getNumberOfBits()]; + // All counts are zero. Ignore duplicates by initialising to 1 + hasher.getBits(shape).forEachRemaining((IntConsumer) idx -> counts[idx] = 1); + } + + @Override + public int cardinality() { + int size = 0; + for (final int c : counts) { + if (c != 0) { + size++; + } + } + return size; + } + + @Override + public boolean contains(BloomFilter other) { + // The AbstractBloomFilter implementation converts both filters to long[] bits. + // This would involve checking all indexes in this filter against zero. + // Ideally we use an iterator of bit indexes to allow fail-fast on the + // first bit index that is zero. + if (other instanceof ArrayCountingBloomFilter) { + verifyShape(other); + return contains(((ArrayCountingBloomFilter) other).iterator()); + } + + // Note: + // This currently creates a StaticHasher which stores all the indexes. + // It would greatly benefit from direct generation of the index iterator + // avoiding the intermediate storage. + return contains(other.getHasher()); + } + + @Override + public boolean contains(final Hasher hasher) { + verifyHasher(hasher); + return contains(hasher.getBits(getShape())); + } + + /** + * Return true if this filter is has non-zero counts for each index in the iterator. + * + * @param iter the iterator + * @return true if this filter contains all the indexes + */ + private boolean contains(final OfInt iter) { + while (iter.hasNext()) { + if (counts[iter.nextInt()] == 0) { + return false; + } + } + return true; + } + + @Override + public long[] getBits() { + final BitSet bs = new BitSet(); + for (int i = 0; i < counts.length; i++) { + if (counts[i] != 0) { + bs.set(i); + } + } + return bs.toLongArray(); + } + + @Override + public StaticHasher getHasher() { + return new StaticHasher(iterator(), getShape()); + } + + /** + * Returns an iterator over the enabled indexes in this filter. + * Any index with a non-zero count is considered enabled. + * The iterator returns indexes in their natural order. + * + * @return an iterator over the enabled indexes + */ + private PrimitiveIterator.OfInt iterator() { + return new IndexIterator(); + } + + @Override + public void merge(final BloomFilter other) { + applyAsBloomFilter(other, this::increment); + } + + @Override + public void merge(final Hasher hasher) { + applyAsHasher(hasher, this::increment); + } + + @Override + public boolean remove(BloomFilter other) { + applyAsBloomFilter(other, this::decrement); + return isValid(); + } + + @Override + public boolean remove(Hasher hasher) { + applyAsHasher(hasher, this::decrement); + return isValid(); + } + + @Override + public boolean add(CountingBloomFilter other) { + applyAsCountingBloomFilter(other, this::add); + return isValid(); + } + + @Override + public boolean subtract(CountingBloomFilter other) { + applyAsCountingBloomFilter(other, this::subtract); + return isValid(); + } + + /** + * {@inheritDoc} + * + * <p><em>Implementation note</em> + * + * <p>The state transition to invalid is permanent. + * + * <p>This implementation does not correct negative counts to zero or integer + * overflow counts to {@link Integer#MAX_VALUE}. Thus the operation that + * generated invalid counts can be reversed by using the complement of the + * original operation with the same Bloom filter. This will restore the counts + * to the state prior to the invalid operation. Counts can then be extracted + * using {@link #forEachCount(BitCountConsumer)}. + */ + @Override + public boolean isValid() { + return state >= 0; + } + + @Override + public void forEachCount(BitCountConsumer action) { + for (int i = 0; i < counts.length; i++) { + if (counts[i] != 0) { + action.accept(i, counts[i]); + } + } + } + + /** + * Apply the action for each index in the Bloom filter. + */ + private void applyAsBloomFilter(final BloomFilter other, final IntConsumer action) { + verifyShape(other); + if (other instanceof ArrayCountingBloomFilter) { + // Only use the presence of non-zero and not the counts + final int[] counts2 = ((ArrayCountingBloomFilter) other).counts; + for (int i = 0; i < counts2.length; i++) { + if (counts2[i] != 0) { + action.accept(i); + } + } + } else { + BitSet.valueOf(other.getBits()).stream().forEach(action); + } + } + + /** + * Apply the action for each index in the hasher. + */ + private void applyAsHasher(final Hasher hasher, final IntConsumer action) { + verifyHasher(hasher); + toSet(hasher).forEach(i -> action.accept(i)); + } + + /** + * Apply the action for each index in the Bloom filter. + */ + private void applyAsCountingBloomFilter(final CountingBloomFilter other, final BitCountConsumer action) { + verifyShape(other); + other.forEachCount(action); + } + + /** + * Increment to the count for the bit index. + * + * @param idx the index + */ + private void increment(int idx) { + final int updated = counts[idx] + 1; + state |= updated; + counts[idx] = updated; + } + + /** + * Decrement from the count for the bit index. + * + * @param idx the index + */ + private void decrement(int idx) { + final int updated = counts[idx] - 1; + state |= updated; + counts[idx] = updated; + } + + /** + * Add to the count for the bit index. + * + * @param idx the index + * @param addend the amount to add + */ + private void add(int idx, int addend) { + final int updated = counts[idx] + addend; + state |= updated; + counts[idx] = updated; + } + + /** + * Subtract from the count for the bit index. + * + * @param idx the index + * @param subtrahend the amount to subtract + */ + private void subtract(int idx, int subtrahend) { + final int updated = counts[idx] - subtrahend; + state |= updated; + counts[idx] = updated; + } + + /** + * Convert the hasher to a set. Duplicates indexes are removed. + * The unique indexes can be used to increment or decrement counts by 1. + * + * @param hasher the hasher + * @return the unique indexes + */ + private Set<Integer> toSet(Hasher hasher) { + final Set<Integer> lst = new HashSet<>(); + hasher.getBits(getShape()).forEachRemaining((Consumer<Integer>) lst::add); + return lst; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java new file mode 100644 index 0000000..8174d1e --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; + +/** + * The interface that describes a Bloom filter that associates a count with each + * bit index to allow reversal of merge operations with remove operations. + * + * <p>A counting Bloom filter is expected to function identically to a standard + * Bloom filter that is the merge of all the Bloom filters that have been added + * to and not later subtracted from the counting Bloom filter. The functional + * state of a CountingBloomFilter at the start and end of a series of merge and + * subsequent remove operations of the same Bloom filters, irrespective of + * remove order, is expected to be the same. + * + * <p>Removal of a filter that has not previously been merged results in an + * invalid state where the counts no longer represent a sum of merged Bloom + * filters. It is impossible to validate merge and remove exactly without + * explicitly storing all filters. Consequently such an operation may go + * undetected. The CountingBloomFilter maintains a state flag that is used as a + * warning that an operation was performed that resulted in invalid counts and + * thus an invalid state. For example this may occur if a count for an index was + * set to negative following a remove operation. + * + * <p>Implementations should document the expected state of the filter after an + * operation that generates invalid counts, and any potential recovery options. + * An implementation may support a reversal of the operation to restore the + * state to that prior to the operation. In the event that invalid counts are + * adjusted to a valid range then it should be documented if there has been + * irreversible information loss. + * + * <p>Implementations may choose to throw an exception during an operation that + * generates invalid counts. Implementations should document the expected state + * of the filter after such an operation. For example are the counts not updated, + * partially updated or updated entirely before the exception is raised. + * + * @since 4.5 + */ +public interface CountingBloomFilter extends BloomFilter { + + /** + * Represents an operation that accepts an {@code <index, count>} pair representing + * the count for a bit index in a counting Bloom filter and returns no result. + * + * <p>Note: This is a functional interface as a primitive type specialization of + * {@link java.util.function.BiConsumer} for {@code int}. + */ + @FunctionalInterface + interface BitCountConsumer { + /** + * Performs this operation on the given {@code <index, count>} pair. + * + * @param index the bit index + * @param count the count at the specified bit index + */ + void accept(int index, int count); + } + + /** + * {@inheritDoc} + * + * <p>Note: If the hasher contains duplicate bit indexes these are ignored. + * All counts for the indexes identified by the other filter will be incremented by 1. + */ + @Override + void merge(BloomFilter other); + + /** + * {@inheritDoc} + * + * <p>Note: If the hasher contains duplicate bit indexes these are ignored. + * All counts for the indexes identified by the other filter will be incremented by 1. + */ + @Override + void merge(Hasher other); + + /** + * Removes the other Bloom filter from this one. + * All counts for the indexes identified by the other filter will be decremented by 1. + * + * <p>This method will return true if the filter is valid after the operation. + * + * @param other the other Bloom filter + * @return true if the removal was successful and the state is valid + * @throws IllegalArgumentException if the shape of the other filter does not match + * the shape of this filter + * @see #isValid() + */ + boolean remove(BloomFilter other); + + /** + * Removes the decomposed Bloom filter defined by the hasher from this Bloom filter. + * All counts for the indexes identified by the hasher will be decremented by 1. + * Duplicate indexes should be ignored. + * + * <p>This method will return true if the filter is valid after the operation. + * + * @param hasher the hasher to provide the indexes + * @return true if the removal was successful and the state is valid + * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of + * this filter + * @see #isValid() + */ + boolean remove(Hasher hasher); + + /** + * Adds the other counting Bloom filter to this one. + * All counts for the indexes identified by the other filter will be incremented by their + * corresponding counts in the other filter. + * + * <p>This method will return true if the filter is valid after the operation. + * + * @param other the other counting Bloom filter + * @return true if the addition was successful and the state is valid + * @throws IllegalArgumentException if the shape of the other filter does not match + * the shape of this filter + * @see #isValid() + */ + boolean add(CountingBloomFilter other); + + /** + * Subtracts the other counting Bloom filter from this one. + * All counts for the indexes identified by the other filter will be decremented by their + * corresponding counts in the other filter. + * + * <p>This method will return true if the filter is valid after the operation. + * + * @param other the other counting Bloom filter + * @return true if the subtraction was successful and the state is valid + * @throws IllegalArgumentException if the shape of the other filter does not match + * the shape of this filter + * @see #isValid() + */ + boolean subtract(CountingBloomFilter other); + + /** + * Returns true if the internal state is valid. This flag is a warning that an addition or + * subtraction of counts from this filter resulted in an invalid count for one or more + * indexes. For example this may occur if a count for an index was + * set to negative following a subtraction operation, or overflows an {@code int} following an + * addition operation. + * + * <p>A counting Bloom filter that has an invalid state is no longer ensured to function + * identically to a standard Bloom filter instance that is the merge of all the Bloom filters + * that have been added to and not later subtracted from this counting Bloom filter. + * + * <p>Note: The change to an invalid state may or may not be reversible. Implementations + * are expected to document their policy on recovery from an addition or removal operation + * that generated an invalid state. + * + * @return true if the state is valid + */ + boolean isValid(); + + /** + * Performs the given action for each {@code <index, count>} pair where the count is non-zero. + * Any exceptions thrown by the action are relayed to the caller. + * + * @param action the action to be performed for each non-zero bit count + * @throws NullPointerException if the specified action is null + */ + void forEachCount(BitCountConsumer action); +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java new file mode 100644 index 0000000..0de90c8 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -0,0 +1,551 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.PrimitiveIterator.OfInt; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.BiConsumer; +import java.util.function.BiPredicate; +import java.util.function.Function; +import java.util.function.ToIntBiFunction; + +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.junit.Test; + +/** + * Tests for the {@link ArrayCountingBloomFilter}. + */ +public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { + + /** + * A simple Hasher implementation to return indexes. Duplicates are allowed. + */ + private static class SimpleHasher implements Hasher { + private Shape shape; + private int[] indexes; + + SimpleHasher(Shape shape, int... indexes) { + this.shape = shape; + this.indexes = indexes; + } + + @Override + public OfInt getBits(Shape shape) { + // Assume shape is the same + return Arrays.stream(indexes).iterator(); + } + + @Override + public HashFunctionIdentity getHashFunctionIdentity() { + return shape.getHashFunctionIdentity(); + } + + @Override + public boolean isEmpty() { + return indexes.length == 0; + } + } + + @Override + protected ArrayCountingBloomFilter createEmptyFilter(final Shape shape) { + return new ArrayCountingBloomFilter(shape); + } + + @Override + protected ArrayCountingBloomFilter createFilter(final Hasher hasher, final Shape shape) { + return new ArrayCountingBloomFilter(hasher, shape); + } + + private ArrayCountingBloomFilter createFromCounts(int[] counts) { + // Use a dummy filter to add the counts to an empty filter + final CountingBloomFilter dummy = new ArrayCountingBloomFilter(shape) { + @Override + public void forEachCount(BitCountConsumer action) { + for (int i = 0; i < counts.length; i++) { + action.accept(i, counts[i]); + } + } + }; + final ArrayCountingBloomFilter bf = new ArrayCountingBloomFilter(shape); + bf.add(dummy); + return bf; + } + + /** + * Assert the counts match the expected values. Values are for indices starting + * at 0. Assert the cardinality equals the number of non-zero counts. + * + * @param bf the bloom filter + * @param expected the expected counts + */ + private static void assertCounts(CountingBloomFilter bf, int[] expected) { + final Map<Integer, Integer> m = new HashMap<>(); + bf.forEachCount(m::put); + int zeros = 0; + for (int i = 0; i < expected.length; i++) { + if (m.get(i) == null) { + assertEquals("Wrong value for " + i, expected[i], 0); + zeros++; + } else { + assertEquals("Wrong value for " + i, expected[i], m.get(i).intValue()); + } + } + assertEquals(expected.length - zeros, bf.cardinality()); + } + + /** + * Tests that counts are correct when a hasher with duplicates is used in the + * constructor. + */ + @Test + public void constructorTest_Hasher_Duplicates() { + final int[] expected = {0, 1, 1, 0, 0, 1}; + // Some indexes with duplicates + final Hasher hasher = new SimpleHasher(shape, 1, 2, 2, 5); + + final ArrayCountingBloomFilter bf = createFilter(hasher, shape); + final long[] lb = bf.getBits(); + assertEquals(1, lb.length); + assertEquals(0b100110L, lb[0]); + + assertCounts(bf, expected); + } + + /** + * Test the contains function with a standard Bloom filter. + * The contains function is tested using a counting Bloom filter in the parent test class. + */ + @Test + public void contains_BloomFilter() { + // Some indexes with duplicates + final Hasher hasher = new SimpleHasher(shape, 1, 2, 5); + final ArrayCountingBloomFilter bf = createFilter(hasher, shape); + assertFalse(bf.contains(new BitSetBloomFilter(new SimpleHasher(shape, 3, 4), shape))); + assertTrue(bf.contains(new BitSetBloomFilter(new SimpleHasher(shape, 2, 5), shape))); + } + + /** + * Tests that merge correctly updates the counts when a CountingBloomFilter is + * passed. + */ + @Test + public void mergeTest_Counts_CountingBloomFilter() { + assertMerge(counts -> createFilter(new SimpleHasher(shape, counts), shape), + BloomFilter::merge); + } + + /** + * Tests that merge correctly updates the counts when a BloomFilter is passed. + */ + @Test + public void mergeTest_Counts_BloomFilter() { + assertMerge(counts -> new BitSetBloomFilter(new SimpleHasher(shape, counts), shape), + BloomFilter::merge); + } + + /** + * Test that merge correctly updates the counts when a Hasher is passed. + */ + @Test + public void mergeTest_Counts_Hasher() { + assertMerge(counts -> new SimpleHasher(shape, counts), + BloomFilter::merge); + } + + /** + * Test that merge correctly updates the counts when a Hasher is passed with duplicates. + */ + @Test + public void mergeTest_Counts_Hasher_Duplicates() { + assertMerge(counts -> new SimpleHasher(shape, createDuplicates(counts)), + BloomFilter::merge); + } + + /** + * Tests that remove correctly updates the counts when a CountingBloomFilter is + * passed. + */ + @Test + public void removeTest_Counts_CountingBloomFilter() { + assertRemove(counts -> createFilter(new SimpleHasher(shape, counts), shape), + CountingBloomFilter::remove); + } + + /** + * Tests that remove correctly updates the counts when a BloomFilter is passed. + */ + @Test + public void removeTest_Counts_BloomFilter() { + assertRemove(counts -> new BitSetBloomFilter(new SimpleHasher(shape, counts), shape), + CountingBloomFilter::remove); + } + + /** + * Test that remove correctly updates the counts when a Hasher is passed. + */ + @Test + public void removeTest_Counts_Hasher() { + assertRemove(counts -> new SimpleHasher(shape, counts), + CountingBloomFilter::remove); + } + + /** + * Test that remove correctly updates the counts when a Hasher is passed with duplicates. + */ + @Test + public void removeTest_Counts_Hasher_Duplicates() { + assertRemove(counts -> new SimpleHasher(shape, createDuplicates(counts)), + CountingBloomFilter::remove); + } + + /** + * Creates duplicates in the counts. + * + * @param counts the counts + * @return the new counts + */ + private static int[] createDuplicates(int[] counts) { + // Duplicate some values randomly + final int length = counts.length; + int[] countsWithDuplicates = Arrays.copyOf(counts, 2 * length); + for (int i = length; i < countsWithDuplicates.length; i++) { + // Copy a random value from the counts into the end position + countsWithDuplicates[i] = countsWithDuplicates[ThreadLocalRandom.current().nextInt(i)]; + } + return countsWithDuplicates; + } + + /** + * Assert a merge operation. The converter should construct a suitable object + * to remove the indices from the provided Bloom filter with the remove operation. + * + * @param <F> the type of the filter + * @param converter the converter + * @param merge the merge operation + */ + private <F> void assertMerge(Function<int[], F> converter, + BiConsumer<ArrayCountingBloomFilter, F> merge) { + final int[] indexes1 = { 1, 2, 4, 5, 6}; + final int[] indexes2 = { 3, 4, 6}; + final int[] expected = {0, 1, 1, 1, 2, 1, 2}; + assertOperation(indexes1, indexes2, converter, merge, expected); + } + + /** + * Assert a remove operation. The converter should construct a suitable object + * to remove the indices from the provided Bloom filter with the remove operation. + * + * @param <F> the type of the filter + * @param converter the converter + * @param remove the remove operation + */ + private <F> void assertRemove(Function<int[], F> converter, + BiConsumer<ArrayCountingBloomFilter, F> remove) { + final int[] indexes1 = { 1, 2, 4, 5, 6}; + final int[] indexes2 = { 2, 5, 6}; + final int[] expected = {0, 1, 0, 0, 1, 0, 0}; + assertOperation(indexes1, indexes2, converter, remove, expected); + } + + /** + * Assert a counting operation. The first set of indexes is used to create the + * CountingBloomFilter. The second set of indices is passed to the converter to + * construct a suitable object to combine with the counting Bloom filter. The counts + * of the first Bloom filter are checked using the expected counts. + * + * <p>Counts are assumed to map to indexes starting from 0. + * + * @param <F> the type of the filter + * @param indexes1 the first set of indexes + * @param indexes2 the second set of indexes + * @param converter the converter + * @param operation the operation + * @param expected the expected counts after the operation + */ + private <F> void assertOperation(int[] indexes1, int[] indexes2, + Function<int[], F> converter, + BiConsumer<ArrayCountingBloomFilter, F> operation, + int[] expected) { + final Hasher hasher = new SimpleHasher(shape, indexes1); + final ArrayCountingBloomFilter bf = createFilter(hasher, shape); + final F filter = converter.apply(indexes2); + operation.accept(bf, filter); + assertCounts(bf, expected); + } + + /** + * Tests that merge errors when the counts overflow the maximum integer value. + */ + @Test + public void mergeTest_Overflow() { + final Hasher hasher = new SimpleHasher(shape, 1, 2, 3); + final ArrayCountingBloomFilter bf = createFilter(hasher, shape); + + ArrayCountingBloomFilter bf2 = createFromCounts(new int[] {0, 0, Integer.MAX_VALUE}); + + // Small + 1 = OK + // should not fail as the counts are ignored + bf.merge(bf2); + assertTrue(bf.isValid()); + assertCounts(bf, new int[] {0, 1, 2, 1}); + + // Big + 1 = Overflow + assertTrue(bf2.isValid()); + bf2.merge(bf); + assertFalse("Merge should overflow and the filter is invalid", bf2.isValid()); + + // The counts are not clipped to max. They have simply overflowed. + // Note that this is a merge and the count is only incremented by 1 + // and not the actual count at each index. So it is not 2 + Integer.MAX_VALUE. + assertCounts(bf2, new int[] {0, 1, 1 + Integer.MAX_VALUE, 1}); + } + + /** + * Tests that removal errors when the counts become negative. + */ + @Test + public void removeTest_Negative() { + final Hasher hasher = new SimpleHasher(shape, 1, 2, 3); + ArrayCountingBloomFilter bf = createFilter(hasher, shape); + + final Hasher hasher2 = new SimpleHasher(shape, 2); + ArrayCountingBloomFilter bf2 = createFilter(hasher2, shape); + + // More - Less = OK + bf.remove(bf2); + assertTrue(bf.isValid()); + assertCounts(bf, new int[] {0, 1, 0, 1}); + + // Less - More = Negative + assertTrue(bf2.isValid()); + bf2.remove(bf); + assertFalse("Remove should create negative counts and the filter is invalid", bf2.isValid()); + + // The counts are not clipped to zero. They have been left as negative. + assertCounts(bf2, new int[] {0, -1, 1, -1}); + } + + /** + * Tests that counts can be added to a new instance. + * + * <p>Note: This test ensures the CountinBloomFilter + * can be created with whatever counts are required for other tests. + */ + @Test + public void addTest_NewInstance() { + for (final int[] counts : new int[][] { + { /* empty */}, + {0, 0, 1}, + {0, 1, 2}, + {2, 3, 4}, + {66, 77, 0, 99}, + {Integer.MAX_VALUE, 42}, + }) { + assertCounts(createFromCounts(counts), counts); + } + } + + /** + * Test that add correctly ignores an empty CountingBloomFilter. + */ + @Test + public void addTest_Empty() { + assertCountingOperation(new int[] {5, 2, 1}, + new int[0], + CountingBloomFilter::add, + true, + new int[] {5, 2, 1}); + } + + /** + * Test that add correctly updates the counts when a CountingBloomFilter is + * passed. + */ + @Test + public void addTest_Counts() { + assertCountingOperation(new int[] {5, 2, 1}, + new int[] {0, 6, 4, 1}, + CountingBloomFilter::add, + true, + new int[] {5, 8, 5, 1}); + } + + /** + * Test that add correctly updates the isValid state when a CountingBloomFilter is + * passed and an integer overflow occurs. + */ + @Test + public void addTest_Overflow() { + assertCountingOperation(new int[] {5, 2, 1}, + new int[] {0, 6, Integer.MAX_VALUE}, + CountingBloomFilter::add, + false, + new int[] {5, 8, 1 + Integer.MAX_VALUE}); + } + + /** + * Test that subtract correctly ignores an empty CountingBloomFilter. + */ + @Test + public void subtractTest_Empty() { + assertCountingOperation(new int[] {5, 2, 1}, + new int[0], + CountingBloomFilter::subtract, + true, + new int[] {5, 2, 1}); + } + + /** + * Test that subtract correctly updates the counts when a CountingBloomFilter is + * passed. + */ + @Test + public void subtractTest_Counts() { + assertCountingOperation(new int[] {5, 9, 1, 1}, + new int[] {0, 2, 1}, + CountingBloomFilter::subtract, + true, + new int[] {5, 7, 0, 1}); + } + + /** + * Test that subtract correctly updates the isValid state when a CountingBloomFilter is + * passed and the counts become negative. + */ + @Test + public void subtractTest_Negative() { + assertCountingOperation(new int[] {5, 2, 1}, + new int[] {0, 6, 1}, + CountingBloomFilter::subtract, + false, + new int[] {5, -4, 0}); + } + + /** + * Assert a counting operation. Two CountingBloomFilters are created from the + * two sets of counts. The operation is applied and the counts of the first + * Bloom filter is checked using the expected counts. + * + * <p>Counts are assumed to map to indexes starting from 0. + * + * @param counts1 the first set counts + * @param counts2 the first set counts + * @param operation the operation + * @param isValid the expected value for the operation result + * @param expected the expected counts after the operation + */ + private void assertCountingOperation(int[] counts1, int[] counts2, + BiPredicate<ArrayCountingBloomFilter, ArrayCountingBloomFilter> operation, + boolean isValid, int[] expected) { + final ArrayCountingBloomFilter bf1 = createFromCounts(counts1); + final ArrayCountingBloomFilter bf2 = createFromCounts(counts2); + final boolean result = operation.test(bf1, bf2); + assertEquals(isValid, result); + assertEquals(isValid, bf1.isValid()); + assertCounts(bf1, expected); + } + + /** + * Tests that the andCardinality calculation executes correctly when using a + * CountingBloomFilter argument. + */ + @Test + public void andCardinalityTest_CountingBloomFilter() { + assertCardinalityOperation(new int[] {1, 1}, + new int[] {1, 1}, + BloomFilter::andCardinality, + 2); + assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, + new int[] {1, 1, 0, 0, 0, 1}, + BloomFilter::andCardinality, + 2); + assertCardinalityOperation(new int[] {1, 1}, + new int[] {0, 0, 1, 1, 1}, + BloomFilter::andCardinality, + 0); + } + + /** + * Tests that the orCardinality calculation executes correctly when using a + * CountingBloomFilter argument. + */ + @Test + public void orCardinalityTest_CountingBloomFilter() { + assertCardinalityOperation(new int[] {1, 1}, + new int[] {1, 1}, + BloomFilter::orCardinality, + 2); + assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, + new int[] {1, 1, 0, 0, 0, 1}, + BloomFilter::orCardinality, + 6); + assertCardinalityOperation(new int[] {1, 1}, + new int[] {0, 0, 1, 1, 1}, + BloomFilter::orCardinality, + 5); + } + + /** + * Tests that the xorCardinality calculation executes correctly when using a + * CountingBloomFilter argument. + */ + @Test + public void xorCardinalityTest_CountingBloomFilter() { + assertCardinalityOperation(new int[] {1, 1}, + new int[] {1, 1}, + BloomFilter::xorCardinality, + 0); + assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, + new int[] {1, 1, 0, 0, 0, 1}, + BloomFilter::xorCardinality, + 4); + assertCardinalityOperation(new int[] {1, 1}, + new int[] {0, 0, 1, 1, 1}, + BloomFilter::xorCardinality, + 5); + } + + /** + * Assert a cardinality operation. Two CountingBloomFilters are created from the + * two sets of counts. The operation is applied and the counts of the first + * Bloom filter is checked using the expected counts. + * + * <p>Counts are assumed to map to indexes starting from 0. + * + * @param counts1 the first set counts + * @param counts2 the first set counts + * @param operation the operation + * @param expected the expected cardinality + */ + private void assertCardinalityOperation(int[] counts1, int[] counts2, + ToIntBiFunction<ArrayCountingBloomFilter, ArrayCountingBloomFilter> operation, + int expected) { + final ArrayCountingBloomFilter bf1 = createFromCounts(counts1); + final ArrayCountingBloomFilter bf2 = createFromCounts(counts2); + assertEquals(expected, operation.applyAsInt(bf1, bf2)); + assertEquals(expected, operation.applyAsInt(bf2, bf1)); + } +}