[commons-collections] 02/04: Added CountingBloomFilter interface and ArrayCountingBloomFilter.

aherbert Sat, 14 Mar 2020 11:34:51 -0700

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-collections.git


commit fb358a5c80a33e2e4ea46bba57a31148bc710f86
Author: Alex Herbert <aherb...@apache.org>
AuthorDate: Mon Mar 2 22:25:45 2020 +0000

    Added CountingBloomFilter interface and ArrayCountingBloomFilter.
---
 .../bloomfilter/ArrayCountingBloomFilter.java      | 396 +++++++++++++++
 .../bloomfilter/CountingBloomFilter.java           | 179 +++++++
 .../bloomfilter/ArrayCountingBloomFilterTest.java  | 551 +++++++++++++++++++++
 3 files changed, 1126 insertions(+)

diff --git 
a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java
 
b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java
new file mode 100644
index 0000000..9538f32
--- /dev/null
+++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java
@@ -0,0 +1,396 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.BitSet;
+import java.util.HashSet;
+import java.util.NoSuchElementException;
+import java.util.PrimitiveIterator;
+import java.util.PrimitiveIterator.OfInt;
+import java.util.function.Consumer;
+import java.util.function.IntConsumer;
+import java.util.Set;
+
+import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
+import org.apache.commons.collections4.bloomfilter.hasher.Shape;
+import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher;
+
+/**
+ * A counting Bloom filter using an array to track counts for each enabled bit
+ * index.
+ *
+ * <p>Any operation that results in negative counts or integer overflow of 
counts will
+ * mark this filter as invalid. This transition is not reversible. The counts 
for the
+ * filter immediately prior to the operation that create invalid counts can be 
recovered.
+ * See the documentation in {@link #isValid()} for details.
+ *
+ * <p>All the operations in the filter assume the counts are currently valid. 
Behaviour
+ * of an invalid filter is undefined. It will no longer function identically 
to a standard
+ * Bloom filter that is the merge of all the Bloom filters that have been added
+ * to and not later subtracted from the counting Bloom filter.
+ *
+ * <p>The maximum supported number of items that can be stored in the filter is
+ * limited by the maximum array size combined with the {@link Shape}. For
+ * example an implementation using a {@link Shape} with a false-positive
+ * probability of 1e-6 and {@link Integer#MAX_VALUE} bits can reversibly store
+ * approximately 75 million items using 20 hash functions per item with a 
memory
+ * consumption of approximately 8 GB.
+ *
+ * @since 4.5
+ * @see Shape
+ */
+public class ArrayCountingBloomFilter extends AbstractBloomFilter implements 
CountingBloomFilter {
+
+    /**
+     * The count of each bit index in the filter.
+     */
+    private final int[] counts;
+
+    /**
+     * The state flag. This is a bitwise OR of the entire history of all 
updated
+     * counts. If negative then a negative count or integer overflow has 
occurred on
+     * one or more counts in the history of the filter and the state is 
invalid.
+     *
+     * <p>Maintenance of this state flag is branch-free for improved 
performance. It
+     * eliminates a conditional check for a negative count during 
remove/subtract
+     * operations and a conditional check for integer overflow during merge/add
+     * operations.
+     *
+     * <p>Note: Integer overflow is unlikely in realistic usage scenarios. A 
count
+     * that overflows indicates that the number of items in the filter exceeds 
the
+     * maximum possible size (number of bits) of any Bloom filter constrained 
by
+     * integer indices. At this point the filter is most likely full (all bits 
are
+     * non-zero) and thus useless.
+     *
+     * <p>Negative counts are a concern if the filter is used incorrectly by
+     * removing an item that was never added. It is expected that a user of a
+     * counting Bloom filter will not perform this action as it is a mistake.
+     * Enabling an explicit recovery path for negative or overflow counts is a 
major
+     * performance burden not deemed necessary for the unlikely scenarios when 
an
+     * invalid state is created. Maintenance of the state flag is a concession 
to
+     * flag improper use that should not have a major performance impact.
+     */
+    private int state;
+
+    /**
+     * An iterator of all indexes with non-zero counts.
+     *
+     * <p>In the event that the filter state is invalid any index with a 
negative count
+     * will also be produced by the iterator.
+     */
+    private class IndexIterator implements PrimitiveIterator.OfInt {
+        /** The next non-zero index (or counts.length). */
+        private int next;
+
+        /**
+         * Create an instance.
+         */
+        IndexIterator() {
+            advance();
+        }
+
+        /**
+         * Advance to the next non-zero index.
+         */
+        void advance() {
+            while (next < counts.length && counts[next] == 0) {
+                next++;
+            }
+        }
+
+        @Override
+        public boolean hasNext() {
+            return next < counts.length;
+        }
+
+        @Override
+        public int nextInt() {
+            if (hasNext()) {
+                final int result = next++;
+                advance();
+                return result;
+            }
+            // Currently unreachable as the iterator is only used by
+            // the StaticHasher which iterates correctly.
+            throw new NoSuchElementException();
+        }
+    }
+
+    /**
+     * Constructs an empty counting Bloom filter with the specified shape.
+     *
+     * @param shape the shape of the filter
+     */
+    public ArrayCountingBloomFilter(final Shape shape) {
+        super(shape);
+        counts = new int[shape.getNumberOfBits()];
+    }
+
+    /**
+     * Constructs a counting Bloom filter from a hasher and a shape.
+     *
+     * <p>The filter will be equal to the result of merging the hasher with an 
empty
+     * filter; specifically duplicate indexes in the hasher are ignored.
+     *
+     * @param hasher the hasher to build the filter from
+     * @param shape the shape of the filter
+     * @throws IllegalArgumentException if the hasher cannot generate indices 
for
+     * the shape
+     * @see #merge(Hasher)
+     */
+    public ArrayCountingBloomFilter(final Hasher hasher, final Shape shape) {
+        super(shape);
+        // Given the filter is empty we can optimise the operation of 
merge(hasher)
+        verifyHasher(hasher);
+        // Delay array allocation until after hasher is verified
+        counts = new int[shape.getNumberOfBits()];
+        // All counts are zero. Ignore duplicates by initialising to 1
+        hasher.getBits(shape).forEachRemaining((IntConsumer) idx -> 
counts[idx] = 1);
+    }
+
+    @Override
+    public int cardinality() {
+        int size = 0;
+        for (final int c : counts) {
+            if (c != 0) {
+                size++;
+            }
+        }
+        return size;
+    }
+
+    @Override
+    public boolean contains(BloomFilter other) {
+        // The AbstractBloomFilter implementation converts both filters to 
long[] bits.
+        // This would involve checking all indexes in this filter against zero.
+        // Ideally we use an iterator of bit indexes to allow fail-fast on the
+        // first bit index that is zero.
+        if (other instanceof ArrayCountingBloomFilter) {
+            verifyShape(other);
+            return contains(((ArrayCountingBloomFilter) other).iterator());
+        }
+
+        // Note:
+        // This currently creates a StaticHasher which stores all the indexes.
+        // It would greatly benefit from direct generation of the index 
iterator
+        // avoiding the intermediate storage.
+        return contains(other.getHasher());
+    }
+
+    @Override
+    public boolean contains(final Hasher hasher) {
+        verifyHasher(hasher);
+        return contains(hasher.getBits(getShape()));
+    }
+
+    /**
+     * Return true if this filter is has non-zero counts for each index in the 
iterator.
+     *
+     * @param iter the iterator
+     * @return true if this filter contains all the indexes
+     */
+    private boolean contains(final OfInt iter) {
+        while (iter.hasNext()) {
+            if (counts[iter.nextInt()] == 0) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    @Override
+    public long[] getBits() {
+        final BitSet bs = new BitSet();
+        for (int i = 0; i < counts.length; i++) {
+            if (counts[i] != 0) {
+                bs.set(i);
+            }
+        }
+        return bs.toLongArray();
+    }
+
+    @Override
+    public StaticHasher getHasher() {
+        return new StaticHasher(iterator(), getShape());
+    }
+
+    /**
+     * Returns an iterator over the enabled indexes in this filter.
+     * Any index with a non-zero count is considered enabled.
+     * The iterator returns indexes in their natural order.
+     *
+     * @return an iterator over the enabled indexes
+     */
+    private PrimitiveIterator.OfInt iterator() {
+        return new IndexIterator();
+    }
+
+    @Override
+    public void merge(final BloomFilter other) {
+        applyAsBloomFilter(other, this::increment);
+    }
+
+    @Override
+    public void merge(final Hasher hasher) {
+        applyAsHasher(hasher, this::increment);
+    }
+
+    @Override
+    public boolean remove(BloomFilter other) {
+        applyAsBloomFilter(other, this::decrement);
+        return isValid();
+    }
+
+    @Override
+    public boolean remove(Hasher hasher) {
+        applyAsHasher(hasher, this::decrement);
+        return isValid();
+    }
+
+    @Override
+    public boolean add(CountingBloomFilter other) {
+        applyAsCountingBloomFilter(other, this::add);
+        return isValid();
+    }
+
+    @Override
+    public boolean subtract(CountingBloomFilter other) {
+        applyAsCountingBloomFilter(other, this::subtract);
+        return isValid();
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p><em>Implementation note</em>
+     *
+     * <p>The state transition to invalid is permanent.
+     *
+     * <p>This implementation does not correct negative counts to zero or 
integer
+     * overflow counts to {@link Integer#MAX_VALUE}. Thus the operation that
+     * generated invalid counts can be reversed by using the complement of the
+     * original operation with the same Bloom filter. This will restore the 
counts
+     * to the state prior to the invalid operation. Counts can then be 
extracted
+     * using {@link #forEachCount(BitCountConsumer)}.
+     */
+    @Override
+    public boolean isValid() {
+        return state >= 0;
+    }
+
+    @Override
+    public void forEachCount(BitCountConsumer action) {
+        for (int i = 0; i < counts.length; i++) {
+            if (counts[i] != 0) {
+                action.accept(i, counts[i]);
+            }
+        }
+    }
+
+    /**
+     * Apply the action for each index in the Bloom filter.
+     */
+    private void applyAsBloomFilter(final BloomFilter other, final IntConsumer 
action) {
+        verifyShape(other);
+        if (other instanceof ArrayCountingBloomFilter) {
+            // Only use the presence of non-zero and not the counts
+            final int[] counts2 = ((ArrayCountingBloomFilter) other).counts;
+            for (int i = 0; i < counts2.length; i++) {
+                if (counts2[i] != 0) {
+                    action.accept(i);
+                }
+            }
+        } else {
+            BitSet.valueOf(other.getBits()).stream().forEach(action);
+        }
+    }
+
+    /**
+     * Apply the action for each index in the hasher.
+     */
+    private void applyAsHasher(final Hasher hasher, final IntConsumer action) {
+        verifyHasher(hasher);
+        toSet(hasher).forEach(i -> action.accept(i));
+    }
+
+    /**
+     * Apply the action for each index in the Bloom filter.
+     */
+    private void applyAsCountingBloomFilter(final CountingBloomFilter other, 
final BitCountConsumer action) {
+        verifyShape(other);
+        other.forEachCount(action);
+    }
+
+    /**
+     * Increment to the count for the bit index.
+     *
+     * @param idx the index
+     */
+    private void increment(int idx) {
+        final int updated = counts[idx] + 1;
+        state |= updated;
+        counts[idx] = updated;
+    }
+
+    /**
+     * Decrement from the count for the bit index.
+     *
+     * @param idx the index
+     */
+    private void decrement(int idx) {
+        final int updated = counts[idx] - 1;
+        state |= updated;
+        counts[idx] = updated;
+    }
+
+    /**
+     * Add to the count for the bit index.
+     *
+     * @param idx the index
+     * @param addend the amount to add
+     */
+    private void add(int idx, int addend) {
+        final int updated = counts[idx] + addend;
+        state |= updated;
+        counts[idx] = updated;
+    }
+
+    /**
+     * Subtract from the count for the bit index.
+     *
+     * @param idx the index
+     * @param subtrahend the amount to subtract
+     */
+    private void subtract(int idx, int subtrahend) {
+        final int updated = counts[idx] - subtrahend;
+        state |= updated;
+        counts[idx] = updated;
+    }
+
+    /**
+     * Convert the hasher to a set. Duplicates indexes are removed.
+     * The unique indexes can be used to increment or decrement counts by 1.
+     *
+     * @param hasher the hasher
+     * @return the unique indexes
+     */
+    private Set<Integer> toSet(Hasher hasher) {
+        final Set<Integer> lst = new HashSet<>();
+        hasher.getBits(getShape()).forEachRemaining((Consumer<Integer>) 
lst::add);
+        return lst;
+    }
+}
diff --git 
a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java
 
b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java
new file mode 100644
index 0000000..8174d1e
--- /dev/null
+++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
+
+/**
+ * The interface that describes a Bloom filter that associates a count with 
each
+ * bit index to allow reversal of merge operations with remove operations.
+ *
+ * <p>A counting Bloom filter is expected to function identically to a standard
+ * Bloom filter that is the merge of all the Bloom filters that have been added
+ * to and not later subtracted from the counting Bloom filter. The functional
+ * state of a CountingBloomFilter at the start and end of a series of merge and
+ * subsequent remove operations of the same Bloom filters, irrespective of
+ * remove order, is expected to be the same.
+ *
+ * <p>Removal of a filter that has not previously been merged results in an
+ * invalid state where the counts no longer represent a sum of merged Bloom
+ * filters. It is impossible to validate merge and remove exactly without
+ * explicitly storing all filters. Consequently such an operation may go
+ * undetected. The CountingBloomFilter maintains a state flag that is used as a
+ * warning that an operation was performed that resulted in invalid counts and
+ * thus an invalid state. For example this may occur if a count for an index 
was
+ * set to negative following a remove operation.
+ *
+ * <p>Implementations should document the expected state of the filter after an
+ * operation that generates invalid counts, and any potential recovery options.
+ * An implementation may support a reversal of the operation to restore the
+ * state to that prior to the operation. In the event that invalid counts are
+ * adjusted to a valid range then it should be documented if there has been
+ * irreversible information loss.
+ *
+ * <p>Implementations may choose to throw an exception during an operation that
+ * generates invalid counts. Implementations should document the expected state
+ * of the filter after such an operation. For example are the counts not 
updated,
+ * partially updated or updated entirely before the exception is raised.
+ *
+ * @since 4.5
+ */
+public interface CountingBloomFilter extends BloomFilter {
+
+    /**
+     * Represents an operation that accepts an {@code <index, count>} pair 
representing
+     * the count for a bit index in a counting Bloom filter and returns no 
result.
+     *
+     * <p>Note: This is a functional interface as a primitive type 
specialization of
+     * {@link java.util.function.BiConsumer} for {@code int}.
+     */
+    @FunctionalInterface
+    interface BitCountConsumer {
+        /**
+         * Performs this operation on the given {@code <index, count>} pair.
+         *
+         * @param index the bit index
+         * @param count the count at the specified bit index
+         */
+        void accept(int index, int count);
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>Note: If the hasher contains duplicate bit indexes these are ignored.
+     * All counts for the indexes identified by the other filter will be 
incremented by 1.
+     */
+    @Override
+    void merge(BloomFilter other);
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>Note: If the hasher contains duplicate bit indexes these are ignored.
+     * All counts for the indexes identified by the other filter will be 
incremented by 1.
+     */
+    @Override
+    void merge(Hasher other);
+
+    /**
+     * Removes the other Bloom filter from this one.
+     * All counts for the indexes identified by the other filter will be 
decremented by 1.
+     *
+     * <p>This method will return true if the filter is valid after the 
operation.
+     *
+     * @param other the other Bloom filter
+     * @return true if the removal was successful and the state is valid
+     * @throws IllegalArgumentException if the shape of the other filter does 
not match
+     * the shape of this filter
+     * @see #isValid()
+     */
+    boolean remove(BloomFilter other);
+
+    /**
+     * Removes the decomposed Bloom filter defined by the hasher from this 
Bloom filter.
+     * All counts for the indexes identified by the hasher will be decremented 
by 1.
+     * Duplicate indexes should be ignored.
+     *
+     * <p>This method will return true if the filter is valid after the 
operation.
+     *
+     * @param hasher the hasher to provide the indexes
+     * @return true if the removal was successful and the state is valid
+     * @throws IllegalArgumentException if the hasher cannot generate indices 
for the shape of
+     * this filter
+     * @see #isValid()
+     */
+    boolean remove(Hasher hasher);
+
+    /**
+     * Adds the other counting Bloom filter to this one.
+     * All counts for the indexes identified by the other filter will be 
incremented by their
+     * corresponding counts in the other filter.
+     *
+     * <p>This method will return true if the filter is valid after the 
operation.
+     *
+     * @param other the other counting Bloom filter
+     * @return true if the addition was successful and the state is valid
+     * @throws IllegalArgumentException if the shape of the other filter does 
not match
+     * the shape of this filter
+     * @see #isValid()
+     */
+    boolean add(CountingBloomFilter other);
+
+    /**
+     * Subtracts the other counting Bloom filter from this one.
+     * All counts for the indexes identified by the other filter will be 
decremented by their
+     * corresponding counts in the other filter.
+     *
+     * <p>This method will return true if the filter is valid after the 
operation.
+     *
+     * @param other the other counting Bloom filter
+     * @return true if the subtraction was successful and the state is valid
+     * @throws IllegalArgumentException if the shape of the other filter does 
not match
+     * the shape of this filter
+     * @see #isValid()
+     */
+    boolean subtract(CountingBloomFilter other);
+
+    /**
+     * Returns true if the internal state is valid. This flag is a warning 
that an addition or
+     * subtraction of counts from this filter resulted in an invalid count for 
one or more
+     * indexes. For example this may occur if a count for an index was
+     * set to negative following a subtraction operation, or overflows an 
{@code int} following an
+     * addition operation.
+     *
+     * <p>A counting Bloom filter that has an invalid state is no longer 
ensured to function
+     * identically to a standard Bloom filter instance that is the merge of 
all the Bloom filters
+     * that have been added to and not later subtracted from this counting 
Bloom filter.
+     *
+     * <p>Note: The change to an invalid state may or may not be reversible. 
Implementations
+     * are expected to document their policy on recovery from an addition or 
removal operation
+     * that generated an invalid state.
+     *
+     * @return true if the state is valid
+     */
+    boolean isValid();
+
+    /**
+     * Performs the given action for each {@code <index, count>} pair where 
the count is non-zero.
+     * Any exceptions thrown by the action are relayed to the caller.
+     *
+     * @param action the action to be performed for each non-zero bit count
+     * @throws NullPointerException if the specified action is null
+     */
+    void forEachCount(BitCountConsumer action);
+}
diff --git 
a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java
 
b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java
new file mode 100644
index 0000000..0de90c8
--- /dev/null
+++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java
@@ -0,0 +1,551 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.PrimitiveIterator.OfInt;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.BiConsumer;
+import java.util.function.BiPredicate;
+import java.util.function.Function;
+import java.util.function.ToIntBiFunction;
+
+import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity;
+import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
+import org.apache.commons.collections4.bloomfilter.hasher.Shape;
+import org.junit.Test;
+
+/**
+ * Tests for the {@link ArrayCountingBloomFilter}.
+ */
+public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest {
+
+    /**
+     * A simple Hasher implementation to return indexes. Duplicates are 
allowed.
+     */
+    private static class SimpleHasher implements Hasher {
+        private Shape shape;
+        private int[] indexes;
+
+        SimpleHasher(Shape shape, int... indexes) {
+            this.shape = shape;
+            this.indexes = indexes;
+        }
+
+        @Override
+        public OfInt getBits(Shape shape) {
+            // Assume shape is the same
+            return Arrays.stream(indexes).iterator();
+        }
+
+        @Override
+        public HashFunctionIdentity getHashFunctionIdentity() {
+            return shape.getHashFunctionIdentity();
+        }
+
+        @Override
+        public boolean isEmpty() {
+            return indexes.length == 0;
+        }
+    }
+
+    @Override
+    protected ArrayCountingBloomFilter createEmptyFilter(final Shape shape) {
+        return new ArrayCountingBloomFilter(shape);
+    }
+
+    @Override
+    protected ArrayCountingBloomFilter createFilter(final Hasher hasher, final 
Shape shape) {
+        return new ArrayCountingBloomFilter(hasher, shape);
+    }
+
+    private ArrayCountingBloomFilter createFromCounts(int[] counts) {
+        // Use a dummy filter to add the counts to an empty filter
+        final CountingBloomFilter dummy = new ArrayCountingBloomFilter(shape) {
+            @Override
+            public void forEachCount(BitCountConsumer action) {
+                for (int i = 0; i < counts.length; i++) {
+                    action.accept(i, counts[i]);
+                }
+            }
+        };
+        final ArrayCountingBloomFilter bf = new 
ArrayCountingBloomFilter(shape);
+        bf.add(dummy);
+        return bf;
+    }
+
+    /**
+     * Assert the counts match the expected values. Values are for indices 
starting
+     * at 0. Assert the cardinality equals the number of non-zero counts.
+     *
+     * @param bf the bloom filter
+     * @param expected the expected counts
+     */
+    private static void assertCounts(CountingBloomFilter bf, int[] expected) {
+        final Map<Integer, Integer> m = new HashMap<>();
+        bf.forEachCount(m::put);
+        int zeros = 0;
+        for (int i = 0; i < expected.length; i++) {
+            if (m.get(i) == null) {
+                assertEquals("Wrong value for " + i, expected[i], 0);
+                zeros++;
+            } else {
+                assertEquals("Wrong value for " + i, expected[i], 
m.get(i).intValue());
+            }
+        }
+        assertEquals(expected.length - zeros, bf.cardinality());
+    }
+
+    /**
+     * Tests that counts are correct when a hasher with duplicates is used in 
the
+     * constructor.
+     */
+    @Test
+    public void constructorTest_Hasher_Duplicates() {
+        final int[] expected = {0, 1, 1, 0, 0, 1};
+        // Some indexes with duplicates
+        final Hasher hasher = new SimpleHasher(shape, 1, 2, 2, 5);
+
+        final ArrayCountingBloomFilter bf = createFilter(hasher, shape);
+        final long[] lb = bf.getBits();
+        assertEquals(1, lb.length);
+        assertEquals(0b100110L, lb[0]);
+
+        assertCounts(bf, expected);
+    }
+
+    /**
+     * Test the contains function with a standard Bloom filter.
+     * The contains function is tested using a counting Bloom filter in the 
parent test class.
+     */
+    @Test
+    public void contains_BloomFilter() {
+        // Some indexes with duplicates
+        final Hasher hasher = new SimpleHasher(shape, 1, 2, 5);
+        final ArrayCountingBloomFilter bf = createFilter(hasher, shape);
+        assertFalse(bf.contains(new BitSetBloomFilter(new SimpleHasher(shape, 
3, 4), shape)));
+        assertTrue(bf.contains(new BitSetBloomFilter(new SimpleHasher(shape, 
2, 5), shape)));
+    }
+
+    /**
+     * Tests that merge correctly updates the counts when a 
CountingBloomFilter is
+     * passed.
+     */
+    @Test
+    public void mergeTest_Counts_CountingBloomFilter() {
+        assertMerge(counts -> createFilter(new SimpleHasher(shape, counts), 
shape),
+            BloomFilter::merge);
+    }
+
+    /**
+     * Tests that merge correctly updates the counts when a BloomFilter is 
passed.
+     */
+    @Test
+    public void mergeTest_Counts_BloomFilter() {
+        assertMerge(counts -> new BitSetBloomFilter(new SimpleHasher(shape, 
counts), shape),
+            BloomFilter::merge);
+    }
+
+    /**
+     * Test that merge correctly updates the counts when a Hasher is passed.
+     */
+    @Test
+    public void mergeTest_Counts_Hasher() {
+        assertMerge(counts -> new SimpleHasher(shape, counts),
+            BloomFilter::merge);
+    }
+
+    /**
+     * Test that merge correctly updates the counts when a Hasher is passed 
with duplicates.
+     */
+    @Test
+    public void mergeTest_Counts_Hasher_Duplicates() {
+        assertMerge(counts -> new SimpleHasher(shape, 
createDuplicates(counts)),
+            BloomFilter::merge);
+    }
+
+    /**
+     * Tests that remove correctly updates the counts when a 
CountingBloomFilter is
+     * passed.
+     */
+    @Test
+    public void removeTest_Counts_CountingBloomFilter() {
+        assertRemove(counts -> createFilter(new SimpleHasher(shape, counts), 
shape),
+            CountingBloomFilter::remove);
+    }
+
+    /**
+     * Tests that remove correctly updates the counts when a BloomFilter is 
passed.
+     */
+    @Test
+    public void removeTest_Counts_BloomFilter() {
+        assertRemove(counts -> new BitSetBloomFilter(new SimpleHasher(shape, 
counts), shape),
+            CountingBloomFilter::remove);
+    }
+
+    /**
+     * Test that remove correctly updates the counts when a Hasher is passed.
+     */
+    @Test
+    public void removeTest_Counts_Hasher() {
+        assertRemove(counts -> new SimpleHasher(shape, counts),
+            CountingBloomFilter::remove);
+    }
+
+    /**
+     * Test that remove correctly updates the counts when a Hasher is passed 
with duplicates.
+     */
+    @Test
+    public void removeTest_Counts_Hasher_Duplicates() {
+        assertRemove(counts -> new SimpleHasher(shape, 
createDuplicates(counts)),
+            CountingBloomFilter::remove);
+    }
+
+    /**
+     * Creates duplicates in the counts.
+     *
+     * @param counts the counts
+     * @return the new counts
+     */
+    private static int[] createDuplicates(int[] counts) {
+        // Duplicate some values randomly
+        final int length = counts.length;
+        int[] countsWithDuplicates = Arrays.copyOf(counts, 2 * length);
+        for (int i = length; i < countsWithDuplicates.length; i++) {
+            // Copy a random value from the counts into the end position
+            countsWithDuplicates[i] = 
countsWithDuplicates[ThreadLocalRandom.current().nextInt(i)];
+        }
+        return countsWithDuplicates;
+    }
+
+    /**
+     * Assert a merge operation. The converter should construct a suitable 
object
+     * to remove the indices from the provided Bloom filter with the remove 
operation.
+     *
+     * @param <F> the type of the filter
+     * @param converter the converter
+     * @param merge the merge operation
+     */
+    private <F> void assertMerge(Function<int[], F> converter,
+                                 BiConsumer<ArrayCountingBloomFilter, F> 
merge) {
+        final int[] indexes1 = {   1, 2,    4, 5, 6};
+        final int[] indexes2 = {         3, 4,    6};
+        final int[] expected = {0, 1, 1, 1, 2, 1, 2};
+        assertOperation(indexes1, indexes2, converter, merge, expected);
+    }
+
+    /**
+     * Assert a remove operation. The converter should construct a suitable 
object
+     * to remove the indices from the provided Bloom filter with the remove 
operation.
+     *
+     * @param <F> the type of the filter
+     * @param converter the converter
+     * @param remove the remove operation
+     */
+    private <F> void assertRemove(Function<int[], F> converter,
+                                  BiConsumer<ArrayCountingBloomFilter, F> 
remove) {
+        final int[] indexes1 = {   1, 2,    4, 5, 6};
+        final int[] indexes2 = {      2,       5, 6};
+        final int[] expected = {0, 1, 0, 0, 1, 0, 0};
+        assertOperation(indexes1, indexes2, converter, remove, expected);
+    }
+
+    /**
+     * Assert a counting operation. The first set of indexes is used to create 
the
+     * CountingBloomFilter. The second set of indices is passed to the 
converter to
+     * construct a suitable object to combine with the counting Bloom filter. 
The counts
+     * of the first Bloom filter are checked using the expected counts.
+     *
+     * <p>Counts are assumed to map to indexes starting from 0.
+     *
+     * @param <F> the type of the filter
+     * @param indexes1 the first set of indexes
+     * @param indexes2 the second set of indexes
+     * @param converter the converter
+     * @param operation the operation
+     * @param expected the expected counts after the operation
+     */
+    private <F> void assertOperation(int[] indexes1, int[] indexes2,
+            Function<int[], F> converter,
+            BiConsumer<ArrayCountingBloomFilter, F> operation,
+            int[] expected) {
+        final Hasher hasher = new SimpleHasher(shape, indexes1);
+        final ArrayCountingBloomFilter bf = createFilter(hasher, shape);
+        final F filter = converter.apply(indexes2);
+        operation.accept(bf, filter);
+        assertCounts(bf, expected);
+    }
+
+    /**
+     * Tests that merge errors when the counts overflow the maximum integer 
value.
+     */
+    @Test
+    public void mergeTest_Overflow() {
+        final Hasher hasher = new SimpleHasher(shape, 1, 2, 3);
+        final ArrayCountingBloomFilter bf = createFilter(hasher, shape);
+
+        ArrayCountingBloomFilter bf2 = createFromCounts(new int[] {0, 0, 
Integer.MAX_VALUE});
+
+        // Small + 1 = OK
+        // should not fail as the counts are ignored
+        bf.merge(bf2);
+        assertTrue(bf.isValid());
+        assertCounts(bf, new int[] {0, 1, 2, 1});
+
+        // Big + 1 = Overflow
+        assertTrue(bf2.isValid());
+        bf2.merge(bf);
+        assertFalse("Merge should overflow and the filter is invalid", 
bf2.isValid());
+
+        // The counts are not clipped to max. They have simply overflowed.
+        // Note that this is a merge and the count is only incremented by 1
+        // and not the actual count at each index. So it is not 2 + 
Integer.MAX_VALUE.
+        assertCounts(bf2, new int[] {0, 1, 1 + Integer.MAX_VALUE, 1});
+    }
+
+    /**
+     * Tests that removal errors when the counts become negative.
+     */
+    @Test
+    public void removeTest_Negative() {
+        final Hasher hasher = new SimpleHasher(shape, 1, 2, 3);
+        ArrayCountingBloomFilter bf = createFilter(hasher, shape);
+
+        final Hasher hasher2 = new SimpleHasher(shape, 2);
+        ArrayCountingBloomFilter bf2 = createFilter(hasher2, shape);
+
+        // More - Less = OK
+        bf.remove(bf2);
+        assertTrue(bf.isValid());
+        assertCounts(bf, new int[] {0, 1, 0, 1});
+
+        // Less - More = Negative
+        assertTrue(bf2.isValid());
+        bf2.remove(bf);
+        assertFalse("Remove should create negative counts and the filter is 
invalid", bf2.isValid());
+
+        // The counts are not clipped to zero. They have been left as negative.
+        assertCounts(bf2, new int[] {0, -1, 1, -1});
+    }
+
+    /**
+     * Tests that counts can be added to a new instance.
+     *
+     * <p>Note: This test ensures the CountinBloomFilter
+     * can be created with whatever counts are required for other tests.
+     */
+    @Test
+    public void addTest_NewInstance() {
+        for (final int[] counts : new int[][] {
+            { /* empty */},
+            {0, 0, 1},
+            {0, 1, 2},
+            {2, 3, 4},
+            {66, 77, 0, 99},
+            {Integer.MAX_VALUE, 42},
+        }) {
+            assertCounts(createFromCounts(counts), counts);
+        }
+    }
+
+    /**
+     * Test that add correctly ignores an empty CountingBloomFilter.
+     */
+    @Test
+    public void addTest_Empty() {
+        assertCountingOperation(new int[] {5, 2, 1},
+                                new int[0],
+                                CountingBloomFilter::add,
+                                true,
+                                new int[] {5, 2, 1});
+    }
+
+    /**
+     * Test that add correctly updates the counts when a CountingBloomFilter is
+     * passed.
+     */
+    @Test
+    public void addTest_Counts() {
+        assertCountingOperation(new int[] {5, 2, 1},
+                                new int[] {0, 6, 4, 1},
+                                CountingBloomFilter::add,
+                                true,
+                                new int[] {5, 8, 5, 1});
+    }
+
+    /**
+     * Test that add correctly updates the isValid state when a 
CountingBloomFilter is
+     * passed and an integer overflow occurs.
+     */
+    @Test
+    public void addTest_Overflow() {
+        assertCountingOperation(new int[] {5, 2, 1},
+                                new int[] {0, 6, Integer.MAX_VALUE},
+                                CountingBloomFilter::add,
+                                false,
+                                new int[] {5, 8, 1 + Integer.MAX_VALUE});
+    }
+
+    /**
+     * Test that subtract correctly ignores an empty CountingBloomFilter.
+     */
+    @Test
+    public void subtractTest_Empty() {
+        assertCountingOperation(new int[] {5, 2, 1},
+                                new int[0],
+                                CountingBloomFilter::subtract,
+                                true,
+                                new int[] {5, 2, 1});
+    }
+
+    /**
+     * Test that subtract correctly updates the counts when a 
CountingBloomFilter is
+     * passed.
+     */
+    @Test
+    public void subtractTest_Counts() {
+        assertCountingOperation(new int[] {5, 9, 1, 1},
+                                new int[] {0, 2, 1},
+                                CountingBloomFilter::subtract,
+                                true,
+                                new int[] {5, 7, 0, 1});
+    }
+
+    /**
+     * Test that subtract correctly updates the isValid state when a 
CountingBloomFilter is
+     * passed and the counts become negative.
+     */
+    @Test
+    public void subtractTest_Negative() {
+        assertCountingOperation(new int[] {5, 2, 1},
+                                new int[] {0, 6, 1},
+                                CountingBloomFilter::subtract,
+                                false,
+                                new int[] {5, -4, 0});
+    }
+
+    /**
+     * Assert a counting operation. Two CountingBloomFilters are created from 
the
+     * two sets of counts. The operation is applied and the counts of the first
+     * Bloom filter is checked using the expected counts.
+     *
+     * <p>Counts are assumed to map to indexes starting from 0.
+     *
+     * @param counts1 the first set counts
+     * @param counts2 the first set counts
+     * @param operation the operation
+     * @param isValid the expected value for the operation result
+     * @param expected the expected counts after the operation
+     */
+    private void assertCountingOperation(int[] counts1, int[] counts2,
+            BiPredicate<ArrayCountingBloomFilter, ArrayCountingBloomFilter> 
operation,
+            boolean isValid, int[] expected) {
+        final ArrayCountingBloomFilter bf1 = createFromCounts(counts1);
+        final ArrayCountingBloomFilter bf2 = createFromCounts(counts2);
+        final boolean result = operation.test(bf1, bf2);
+        assertEquals(isValid, result);
+        assertEquals(isValid, bf1.isValid());
+        assertCounts(bf1, expected);
+    }
+
+    /**
+     * Tests that the andCardinality calculation executes correctly when using 
a
+     * CountingBloomFilter argument.
+     */
+    @Test
+    public void andCardinalityTest_CountingBloomFilter() {
+        assertCardinalityOperation(new int[] {1, 1},
+                                   new int[] {1, 1},
+                                   BloomFilter::andCardinality,
+                                   2);
+        assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0},
+                                   new int[] {1, 1, 0, 0, 0, 1},
+                                   BloomFilter::andCardinality,
+                                   2);
+        assertCardinalityOperation(new int[] {1, 1},
+                                   new int[] {0, 0, 1, 1, 1},
+                                   BloomFilter::andCardinality,
+                                   0);
+    }
+
+    /**
+     * Tests that the orCardinality calculation executes correctly when using a
+     * CountingBloomFilter argument.
+     */
+    @Test
+    public void orCardinalityTest_CountingBloomFilter() {
+        assertCardinalityOperation(new int[] {1, 1},
+                                   new int[] {1, 1},
+                                   BloomFilter::orCardinality,
+                                   2);
+        assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0},
+                                   new int[] {1, 1, 0, 0, 0, 1},
+                                   BloomFilter::orCardinality,
+                                   6);
+        assertCardinalityOperation(new int[] {1, 1},
+                                   new int[] {0, 0, 1, 1, 1},
+                                   BloomFilter::orCardinality,
+                                   5);
+    }
+
+    /**
+     * Tests that the xorCardinality calculation executes correctly when using 
a
+     * CountingBloomFilter argument.
+     */
+    @Test
+    public void xorCardinalityTest_CountingBloomFilter() {
+        assertCardinalityOperation(new int[] {1, 1},
+                                   new int[] {1, 1},
+                                   BloomFilter::xorCardinality,
+                                   0);
+        assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0},
+                                   new int[] {1, 1, 0, 0, 0, 1},
+                                   BloomFilter::xorCardinality,
+                                   4);
+        assertCardinalityOperation(new int[] {1, 1},
+                                   new int[] {0, 0, 1, 1, 1},
+                                   BloomFilter::xorCardinality,
+                                   5);
+    }
+
+    /**
+     * Assert a cardinality operation. Two CountingBloomFilters are created 
from the
+     * two sets of counts. The operation is applied and the counts of the first
+     * Bloom filter is checked using the expected counts.
+     *
+     * <p>Counts are assumed to map to indexes starting from 0.
+     *
+     * @param counts1 the first set counts
+     * @param counts2 the first set counts
+     * @param operation the operation
+     * @param expected the expected cardinality
+     */
+    private void assertCardinalityOperation(int[] counts1, int[] counts2,
+            ToIntBiFunction<ArrayCountingBloomFilter, 
ArrayCountingBloomFilter> operation,
+            int expected) {
+        final ArrayCountingBloomFilter bf1 = createFromCounts(counts1);
+        final ArrayCountingBloomFilter bf2 = createFromCounts(counts2);
+        assertEquals(expected, operation.applyAsInt(bf1, bf2));
+        assertEquals(expected, operation.applyAsInt(bf2, bf1));
+    }
+}

[commons-collections] 02/04: Added CountingBloomFilter interface and ArrayCountingBloomFilter.

Reply via email to