gsmiller commented on code in PR #14273: URL: https://github.com/apache/lucene/pull/14273#discussion_r2017239514
########## lucene/core/src/java/org/apache/lucene/search/DISIDocIdStream.java: ########## @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.util.FixedBitSet; + +final class DISIDocIdStream extends DocIdStream { + + private final DocIdSetIterator iterator; + private final int to; Review Comment: minor: maybe `max` to be consistent with the other implementations? ########## lucene/core/src/java/org/apache/lucene/search/DocIdStream.java: ########## @@ -34,12 +33,34 @@ protected DocIdStream() {} * Iterate over doc IDs contained in this stream in order, calling the given {@link * CheckedIntConsumer} on them. This is a terminal operation. */ - public abstract void forEach(CheckedIntConsumer<IOException> consumer) throws IOException; + public void forEach(CheckedIntConsumer<IOException> consumer) throws IOException { + forEach(DocIdSetIterator.NO_MORE_DOCS, consumer); + } + + /** + * Iterate over doc IDs contained in this doc ID stream up to the given {@code upTo} exclusive, + * calling the given {@link CheckedIntConsumer} on them. It is not possible to iterate these doc + * IDs again later on. + */ + public abstract void forEach(int upTo, CheckedIntConsumer<IOException> consumer) + throws IOException; /** Count the number of entries in this stream. This is a terminal operation. */ public int count() throws IOException { - int[] count = new int[1]; - forEach(_ -> count[0]++); - return count[0]; + return count(DocIdSetIterator.NO_MORE_DOCS); } + + /** + * Count the number of doc IDs in this stream that are below the given {@code upTo}. These doc IDs + * may not be consumed again later. + */ + // Note: it's abstract rather than having a default impl that delegates to #forEach because doing Review Comment: Thanks for adding this comment! +1 to the rationale as well. ########## lucene/core/src/java/org/apache/lucene/search/DISIDocIdStream.java: ########## @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.util.FixedBitSet; + +final class DISIDocIdStream extends DocIdStream { + + private final DocIdSetIterator iterator; + private final int to; + private final FixedBitSet spare; + + DISIDocIdStream(DocIdSetIterator iterator, int to, FixedBitSet spare) { Review Comment: Would it make sense to offer another ctor that takes care of allocating `spare`? I suppose there's not actually a use-case for it right now since the one calling code point of this currently has a bit set that can be reused, so maybe not worth adding until there's an actual reason. ########## lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java: ########## @@ -204,6 +205,39 @@ public int cardinality() { return Math.toIntExact(tot); } + /** + * Return the number of set bits between indexes {@code from} inclusive and {@code to} exclusive. + */ + public int cardinality(int from, int to) { + Objects.checkFromToIndex(from, to, length()); + + int cardinality = 0; + + // First, align `from` with a word start, ie. a multiple of Long.SIZE (64) + if ((from & 0x3F) != 0) { + long bits = this.bits[from >> 6] >>> from; + int numBitsTilNextWord = -from & 0x3F; + if (to - from < numBitsTilNextWord) { + bits &= (1L << (to - from)) - 1L; + return Long.bitCount(bits); + } + cardinality += Long.bitCount(bits); + from += numBitsTilNextWord; + } + Review Comment: minor: what about `assert (from & 0x3F) == 0;` right here? ########## lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java: ########## @@ -790,4 +824,47 @@ public void applyMask(FixedBitSet bitSet, int offset) { throw new IllegalArgumentException("Some bits are set beyond the end of live docs"); } } + + /** + * For each set bit from {@code from} inclusive to {@code to} exclusive, add {@code base} to the + * bit index and call {@code consumer} on it. This is internally used by queries that use bit sets + * as intermediate representations of their matches. + */ + public void forEach(int from, int to, int base, CheckedIntConsumer<IOException> consumer) + throws IOException { + Objects.checkFromToIndex(from, to, length()); + + // First, align `from` with a word start, ie. a multiple of Long.SIZE (64) + if ((from & 0x3F) != 0) { + long bits = this.bits[from >> 6] >>> from; + int numBitsTilNextWord = -from & 0x3F; + if (to - from < numBitsTilNextWord) { + // All bits are in a single word + bits &= (1L << (to - from)) - 1L; + forEach(bits, from + base, consumer); + return; + } + forEach(bits, from + base, consumer); + from += numBitsTilNextWord; + } + Review Comment: minor: same suggestion of `assert (from & 0x3F) == 0;` ########## lucene/core/src/java/org/apache/lucene/search/BitSetDocIdStream.java: ########## @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.util.FixedBitSet; + +final class BitSetDocIdStream extends DocIdStream { + + private final FixedBitSet bitSet; + private final int offset, max; + private int upTo; + + BitSetDocIdStream(FixedBitSet bitSet, int offset) { + this.bitSet = bitSet; + this.offset = offset; + upTo = offset; + max = (int) Math.min(Integer.MAX_VALUE, (long) offset + bitSet.length()); + } + + @Override + public boolean mayHaveRemaining() { + return upTo < max; + } + + @Override + public void forEach(int upTo, CheckedIntConsumer<IOException> consumer) throws IOException { + if (upTo >= this.upTo) { + upTo = Math.min(upTo, max); + bitSet.forEach(this.upTo - offset, upTo - offset, offset, consumer); + this.upTo = upTo; + } + } + + @Override + public int count(int upTo) throws IOException { + if (upTo >= this.upTo) { Review Comment: Another place where I think you want `>` ########## lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java: ########## @@ -207,8 +164,32 @@ private void scoreWindowIntoBitSetAndReplay( acceptDocs.applyMask(matching, base); } - docIdStreamView.base = base; - collector.collect(docIdStreamView); + if (buckets == null) { + if (acceptDocs != null) { + // In this case, live docs have not been applied yet. + acceptDocs.applyMask(matching, base); + } Review Comment: If I'm looking at this diff properly, I don't think you should need this block of code at all since you're still applying `acceptDocs` immediately prior? ########## lucene/core/src/java/org/apache/lucene/search/DISIDocIdStream.java: ########## @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.util.FixedBitSet; + +final class DISIDocIdStream extends DocIdStream { + + private final DocIdSetIterator iterator; + private final int to; + private final FixedBitSet spare; + + DISIDocIdStream(DocIdSetIterator iterator, int to, FixedBitSet spare) { + if (to - iterator.docID() > spare.length()) { + throw new IllegalArgumentException("Bit set is too small to hold all potential matches"); + } + this.iterator = iterator; + this.to = to; + this.spare = spare; + } + + @Override + public boolean mayHaveRemaining() { + return iterator.docID() < to; + } + + @Override + public void forEach(int upTo, CheckedIntConsumer<IOException> consumer) throws IOException { + // If there are no live docs to apply, loading matching docs into a bit set and then iterating + // bits is unlikely to beat iterating the iterator directly. + upTo = Math.min(upTo, to); + for (int doc = iterator.docID(); doc < upTo; doc = iterator.nextDoc()) { + consumer.accept(doc); + } + } + + @Override + public int count(int upTo) throws IOException { + if (iterator.docID() >= upTo) { + return 0; + } + // If the collector is just interested in the count, loading in a bit set and counting bits is + // often faster than incrementing a counter on every call to nextDoc(). + assert spare.scanIsEmpty(); Review Comment: Nice. I appreciate this assert in here! ########## lucene/core/src/java/org/apache/lucene/search/RangeDocIdStream.java: ########## @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; + +final class RangeDocIdStream extends DocIdStream { + + private int upTo; + private final int max; + + RangeDocIdStream(int min, int max) { + if (max < min) { + throw new IllegalArgumentException("min = " + min + " > max = " + max); + } + this.upTo = min; + this.max = max; + } + + @Override + public boolean mayHaveRemaining() { + return upTo < max; + } + + @Override + public void forEach(int upTo, CheckedIntConsumer<IOException> consumer) throws IOException { + if (upTo >= this.upTo) { Review Comment: Another place where I think you want `>` ########## lucene/core/src/java/org/apache/lucene/search/BitSetDocIdStream.java: ########## @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.util.FixedBitSet; + +final class BitSetDocIdStream extends DocIdStream { + + private final FixedBitSet bitSet; + private final int offset, max; + private int upTo; + + BitSetDocIdStream(FixedBitSet bitSet, int offset) { + this.bitSet = bitSet; + this.offset = offset; + upTo = offset; + max = (int) Math.min(Integer.MAX_VALUE, (long) offset + bitSet.length()); + } + + @Override + public boolean mayHaveRemaining() { + return upTo < max; + } + + @Override + public void forEach(int upTo, CheckedIntConsumer<IOException> consumer) throws IOException { + if (upTo >= this.upTo) { Review Comment: I think you want `>` here instead? (`>=` still functionally works since `forEach` is tolerant of the `==` case, but I think you want to short circuit if `upTo == this.upTo`?) (Also possible I'm getting this wrong... but I did make the changes locally and all the testing still seems to pass) ########## lucene/core/src/java/org/apache/lucene/search/RangeDocIdStream.java: ########## @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; + +final class RangeDocIdStream extends DocIdStream { + + private int upTo; + private final int max; + + RangeDocIdStream(int min, int max) { + if (max < min) { + throw new IllegalArgumentException("min = " + min + " > max = " + max); + } + this.upTo = min; + this.max = max; + } + + @Override + public boolean mayHaveRemaining() { + return upTo < max; + } + + @Override + public void forEach(int upTo, CheckedIntConsumer<IOException> consumer) throws IOException { + if (upTo >= this.upTo) { + upTo = Math.min(upTo, max); + for (int doc = this.upTo; doc < upTo; ++doc) { + consumer.accept(doc); + } + this.upTo = upTo; + } + } + + @Override + public int count(int upTo) throws IOException { + if (upTo >= this.upTo) { Review Comment: And one more spot where I think you want `>`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org