This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-collections.git

commit fe88827643be091df5ed11d89d5f2e2ec4c61892
Author: Alex Herbert <aherb...@apache.org>
AuthorDate: Sun Mar 8 13:48:10 2020 +0000

    Move the unique filtering of the Hasher indexes to a separate class.
---
 .../bloomfilter/ArrayCountingBloomFilter.java      |  19 +---
 .../collections4/bloomfilter/IndexFilters.java     |  84 +++++++++++++++++
 .../bloomfilter/ArrayCountingBloomFilterTest.java  |  63 ++++---------
 .../bloomfilter/FixedIndexesTestHasher.java        |  67 ++++++++++++++
 .../collections4/bloomfilter/IndexFilterTest.java  | 103 +++++++++++++++++++++
 5 files changed, 272 insertions(+), 64 deletions(-)

diff --git 
a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java
 
b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java
index 9538f32..55e503f 100644
--- 
a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java
+++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java
@@ -17,13 +17,10 @@
 package org.apache.commons.collections4.bloomfilter;
 
 import java.util.BitSet;
-import java.util.HashSet;
 import java.util.NoSuchElementException;
 import java.util.PrimitiveIterator;
 import java.util.PrimitiveIterator.OfInt;
-import java.util.function.Consumer;
 import java.util.function.IntConsumer;
-import java.util.Set;
 
 import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
 import org.apache.commons.collections4.bloomfilter.hasher.Shape;
@@ -324,7 +321,8 @@ public class ArrayCountingBloomFilter extends 
AbstractBloomFilter implements Cou
      */
     private void applyAsHasher(final Hasher hasher, final IntConsumer action) {
         verifyHasher(hasher);
-        toSet(hasher).forEach(i -> action.accept(i));
+        // We do not naturally handle duplicates so filter them.
+        IndexFilters.distinctIndexes(hasher, getShape(), action);
     }
 
     /**
@@ -380,17 +378,4 @@ public class ArrayCountingBloomFilter extends 
AbstractBloomFilter implements Cou
         state |= updated;
         counts[idx] = updated;
     }
-
-    /**
-     * Convert the hasher to a set. Duplicates indexes are removed.
-     * The unique indexes can be used to increment or decrement counts by 1.
-     *
-     * @param hasher the hasher
-     * @return the unique indexes
-     */
-    private Set<Integer> toSet(Hasher hasher) {
-        final Set<Integer> lst = new HashSet<>();
-        hasher.getBits(getShape()).forEachRemaining((Consumer<Integer>) 
lst::add);
-        return lst;
-    }
 }
diff --git 
a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java 
b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java
new file mode 100644
index 0000000..c9b81ba
--- /dev/null
+++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
+import org.apache.commons.collections4.bloomfilter.hasher.Shape;
+
+import java.util.Objects;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.function.Consumer;
+import java.util.function.IntConsumer;
+
+/**
+ * Contains functions to filter indexes.
+ */
+final class IndexFilters {
+    /** Do not instantiate. */
+    private IndexFilters() {
+    }
+
+    /**
+     * Transfer all distinct indexes in the specified {@code hasher} generated 
for the
+     * specified {@code shape} to the specified {@code consumer}. For example 
this
+     * can be used to merge a {@link Hasher} representation of a Bloom filter 
into a
+     * {@link BloomFilter} instance that does not naturally handle duplicate 
indexes.
+     *
+     * <p>This method is functionally equivalent to:
+     *
+     * <pre>
+     *     final Set&lt;Integer&gt; distinct = new TreeSet&lt;&gt;();
+     *     hasher.getBits(shape).forEachRemaining((Consumer&lt;Integer&gt;) i 
-&gt; {
+     *         if (distinct.add(i)) {
+     *             consumer.accept(i);
+     *         }
+     *     });
+     * </pre>
+     *
+     * @param hasher the hasher
+     * @param shape the shape
+     * @param consumer the consumer to receive distinct indexes
+     * @throws NullPointerException if the hasher, shape or action are null
+     * @see Hasher#getBits(Shape)
+     */
+    static void distinctIndexes(Hasher hasher, Shape shape, IntConsumer 
consumer) {
+        Objects.requireNonNull(hasher, "hasher");
+        Objects.requireNonNull(shape, "shape");
+        Objects.requireNonNull(consumer, "consumer");
+
+        // TODO
+        // This function can be optimised based on the expected size
+        // (number of indexes) of the hasher and the number of bits in the 
shape.
+        //
+        // A large size would benefit from a pre-allocated BitSet-type filter.
+        // A very small size may be more efficient as a simple array of values
+        // that have already been seen that is scanned for each new index.
+        //
+        // A default is to use a Set to filter distinct values. The choice of 
set
+        // should be evaluated. A HashSet would be optimal if size is known.
+        // A TreeSet has lower memory consumption and performance is not as
+        // sensitive to knowing the size in advance.
+
+        final Set<Integer> distinct = new TreeSet<>();
+        hasher.getBits(shape).forEachRemaining((Consumer<Integer>) i -> {
+            if (distinct.add(i)) {
+                consumer.accept(i);
+            }
+        });
+    }
+}
diff --git 
a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java
 
b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java
index 0de90c8..1940287 100644
--- 
a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java
+++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java
@@ -23,14 +23,12 @@ import static org.junit.Assert.assertTrue;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.PrimitiveIterator.OfInt;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.function.BiConsumer;
 import java.util.function.BiPredicate;
 import java.util.function.Function;
 import java.util.function.ToIntBiFunction;
 
-import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity;
 import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
 import org.apache.commons.collections4.bloomfilter.hasher.Shape;
 import org.junit.Test;
@@ -40,35 +38,6 @@ import org.junit.Test;
  */
 public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest {
 
-    /**
-     * A simple Hasher implementation to return indexes. Duplicates are 
allowed.
-     */
-    private static class SimpleHasher implements Hasher {
-        private Shape shape;
-        private int[] indexes;
-
-        SimpleHasher(Shape shape, int... indexes) {
-            this.shape = shape;
-            this.indexes = indexes;
-        }
-
-        @Override
-        public OfInt getBits(Shape shape) {
-            // Assume shape is the same
-            return Arrays.stream(indexes).iterator();
-        }
-
-        @Override
-        public HashFunctionIdentity getHashFunctionIdentity() {
-            return shape.getHashFunctionIdentity();
-        }
-
-        @Override
-        public boolean isEmpty() {
-            return indexes.length == 0;
-        }
-    }
-
     @Override
     protected ArrayCountingBloomFilter createEmptyFilter(final Shape shape) {
         return new ArrayCountingBloomFilter(shape);
@@ -124,7 +93,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
     public void constructorTest_Hasher_Duplicates() {
         final int[] expected = {0, 1, 1, 0, 0, 1};
         // Some indexes with duplicates
-        final Hasher hasher = new SimpleHasher(shape, 1, 2, 2, 5);
+        final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 2, 5);
 
         final ArrayCountingBloomFilter bf = createFilter(hasher, shape);
         final long[] lb = bf.getBits();
@@ -141,10 +110,10 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
     @Test
     public void contains_BloomFilter() {
         // Some indexes with duplicates
-        final Hasher hasher = new SimpleHasher(shape, 1, 2, 5);
+        final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 5);
         final ArrayCountingBloomFilter bf = createFilter(hasher, shape);
-        assertFalse(bf.contains(new BitSetBloomFilter(new SimpleHasher(shape, 
3, 4), shape)));
-        assertTrue(bf.contains(new BitSetBloomFilter(new SimpleHasher(shape, 
2, 5), shape)));
+        assertFalse(bf.contains(new BitSetBloomFilter(new 
FixedIndexesTestHasher(shape, 3, 4), shape)));
+        assertTrue(bf.contains(new BitSetBloomFilter(new 
FixedIndexesTestHasher(shape, 2, 5), shape)));
     }
 
     /**
@@ -153,7 +122,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void mergeTest_Counts_CountingBloomFilter() {
-        assertMerge(counts -> createFilter(new SimpleHasher(shape, counts), 
shape),
+        assertMerge(counts -> createFilter(new FixedIndexesTestHasher(shape, 
counts), shape),
             BloomFilter::merge);
     }
 
@@ -162,7 +131,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void mergeTest_Counts_BloomFilter() {
-        assertMerge(counts -> new BitSetBloomFilter(new SimpleHasher(shape, 
counts), shape),
+        assertMerge(counts -> new BitSetBloomFilter(new 
FixedIndexesTestHasher(shape, counts), shape),
             BloomFilter::merge);
     }
 
@@ -171,7 +140,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void mergeTest_Counts_Hasher() {
-        assertMerge(counts -> new SimpleHasher(shape, counts),
+        assertMerge(counts -> new FixedIndexesTestHasher(shape, counts),
             BloomFilter::merge);
     }
 
@@ -180,7 +149,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void mergeTest_Counts_Hasher_Duplicates() {
-        assertMerge(counts -> new SimpleHasher(shape, 
createDuplicates(counts)),
+        assertMerge(counts -> new FixedIndexesTestHasher(shape, 
createDuplicates(counts)),
             BloomFilter::merge);
     }
 
@@ -190,7 +159,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void removeTest_Counts_CountingBloomFilter() {
-        assertRemove(counts -> createFilter(new SimpleHasher(shape, counts), 
shape),
+        assertRemove(counts -> createFilter(new FixedIndexesTestHasher(shape, 
counts), shape),
             CountingBloomFilter::remove);
     }
 
@@ -199,7 +168,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void removeTest_Counts_BloomFilter() {
-        assertRemove(counts -> new BitSetBloomFilter(new SimpleHasher(shape, 
counts), shape),
+        assertRemove(counts -> new BitSetBloomFilter(new 
FixedIndexesTestHasher(shape, counts), shape),
             CountingBloomFilter::remove);
     }
 
@@ -208,7 +177,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void removeTest_Counts_Hasher() {
-        assertRemove(counts -> new SimpleHasher(shape, counts),
+        assertRemove(counts -> new FixedIndexesTestHasher(shape, counts),
             CountingBloomFilter::remove);
     }
 
@@ -217,7 +186,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void removeTest_Counts_Hasher_Duplicates() {
-        assertRemove(counts -> new SimpleHasher(shape, 
createDuplicates(counts)),
+        assertRemove(counts -> new FixedIndexesTestHasher(shape, 
createDuplicates(counts)),
             CountingBloomFilter::remove);
     }
 
@@ -289,7 +258,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
             Function<int[], F> converter,
             BiConsumer<ArrayCountingBloomFilter, F> operation,
             int[] expected) {
-        final Hasher hasher = new SimpleHasher(shape, indexes1);
+        final Hasher hasher = new FixedIndexesTestHasher(shape, indexes1);
         final ArrayCountingBloomFilter bf = createFilter(hasher, shape);
         final F filter = converter.apply(indexes2);
         operation.accept(bf, filter);
@@ -301,7 +270,7 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void mergeTest_Overflow() {
-        final Hasher hasher = new SimpleHasher(shape, 1, 2, 3);
+        final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3);
         final ArrayCountingBloomFilter bf = createFilter(hasher, shape);
 
         ArrayCountingBloomFilter bf2 = createFromCounts(new int[] {0, 0, 
Integer.MAX_VALUE});
@@ -328,10 +297,10 @@ public class ArrayCountingBloomFilterTest extends 
AbstractBloomFilterTest {
      */
     @Test
     public void removeTest_Negative() {
-        final Hasher hasher = new SimpleHasher(shape, 1, 2, 3);
+        final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3);
         ArrayCountingBloomFilter bf = createFilter(hasher, shape);
 
-        final Hasher hasher2 = new SimpleHasher(shape, 2);
+        final Hasher hasher2 = new FixedIndexesTestHasher(shape, 2);
         ArrayCountingBloomFilter bf2 = createFilter(hasher2, shape);
 
         // More - Less = OK
diff --git 
a/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java
 
b/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java
new file mode 100644
index 0000000..4477a93
--- /dev/null
+++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity;
+import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
+import org.apache.commons.collections4.bloomfilter.hasher.Shape;
+
+import java.util.Arrays;
+import java.util.PrimitiveIterator.OfInt;
+
+/**
+ * A Hasher implementation to return fixed indexes. Duplicates are allowed.
+ * The shape is ignored when generating the indexes.
+ *
+ * <p><strong>This is not a real hasher and is used for testing only</strong>.
+ */
+class FixedIndexesTestHasher implements Hasher {
+    /** The shape. */
+    private Shape shape;
+    /** The indexes. */
+    private int[] indexes;
+
+    /**
+     * Create an instance.
+     *
+     * @param shape the shape
+     * @param indexes the indexes
+     */
+    FixedIndexesTestHasher(Shape shape, int... indexes) {
+        this.shape = shape;
+        this.indexes = indexes;
+    }
+
+    @Override
+    public OfInt getBits(Shape shape) {
+        if (!this.shape.equals(shape)) {
+            throw new IllegalArgumentException(
+                String.format("shape (%s) does not match internal shape (%s)", 
shape, this.shape));
+        }
+        return Arrays.stream(indexes).iterator();
+    }
+
+    @Override
+    public HashFunctionIdentity getHashFunctionIdentity() {
+        return shape.getHashFunctionIdentity();
+    }
+
+    @Override
+    public boolean isEmpty() {
+        return indexes.length == 0;
+    }
+}
diff --git 
a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java
 
b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java
new file mode 100644
index 0000000..778960d
--- /dev/null
+++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import 
org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentityImpl;
+import org.apache.commons.collections4.bloomfilter.hasher.Shape;
+import 
org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType;
+import 
org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Set;
+import java.util.function.IntConsumer;
+import java.util.stream.Collectors;
+
+/**
+ * Tests for the {@link IndexFilters}.
+ */
+public class IndexFilterTest {
+
+    /**
+     * The shape of the dummy Bloom filter.
+     * This is used as an argument to a Hasher that just returns fixed indexes
+     * so the parameters do not matter.
+     */
+    private Shape shape = new Shape(new HashFunctionIdentityImpl(
+        "Apache Commons Collections", "Dummy", Signedness.SIGNED, 
ProcessType.CYCLIC, 0L),
+        50, 3000, 4);
+
+    @Test
+    public void testApplyThrowsWithNullArguments() {
+        final FixedIndexesTestHasher hasher = new 
FixedIndexesTestHasher(shape, 1, 2, 3);
+        final Shape shape = this.shape;
+        final ArrayList<Integer> actual = new ArrayList<>();
+        final IntConsumer consumer = actual::add;
+
+        try {
+            IndexFilters.distinctIndexes(null, shape, consumer);
+            Assert.fail("null hasher");
+        } catch (NullPointerException expected) {
+            // Ignore
+        }
+
+        try {
+            IndexFilters.distinctIndexes(hasher, null, consumer);
+            Assert.fail("null shape");
+        } catch (NullPointerException expected) {
+            // Ignore
+        }
+
+        try {
+            IndexFilters.distinctIndexes(hasher, shape, null);
+            Assert.fail("null consumer");
+        } catch (NullPointerException expected) {
+            // Ignore
+        }
+
+        // All OK together
+        IndexFilters.distinctIndexes(hasher, shape, consumer);
+    }
+
+    @Test
+    public void testApply() {
+        assertFilter(1, 4, 6, 7, 9);
+    }
+
+    @Test
+    public void testApplyWithDuplicates() {
+        assertFilter(1, 4, 4, 6, 7, 7, 7, 7, 7, 9);
+    }
+
+    private void assertFilter(int... indexes) {
+        final FixedIndexesTestHasher hasher = new 
FixedIndexesTestHasher(shape, indexes);
+        final Set<Integer> expected = 
Arrays.stream(indexes).boxed().collect(Collectors.toSet());
+        final ArrayList<Integer> actual = new ArrayList<>();
+
+        IndexFilters.distinctIndexes(hasher, shape, actual::add);
+
+        Assert.assertEquals(expected.size(), actual.size());
+        // Check the array has all the values.
+        // We do not currently check the order of indexes from the
+        // hasher.getBits() function.
+        for (Integer index : actual) {
+            Assert.assertTrue(expected.contains(index));
+        }
+    }
+}

Reply via email to