jpountz commented on a change in pull request #438: URL: https://github.com/apache/lucene/pull/438#discussion_r759955885
########## File path: lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java ########## @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util; + +import org.apache.lucene.search.DocIdSetIterator; + +/** + * A @{@link DocIdSetIterator} like {@link BitSetIterator} but has a doc base in onder to avoid + * storing previous 0s. + */ +public class DocBaseBitSetIterator extends DocIdSetIterator { + + private final FixedBitSet bits; + private final int length; + private final long cost; + private final int docBase; + private int doc = -1; + + public DocBaseBitSetIterator(FixedBitSet bits, long cost, int docBase) { + if (cost < 0) { + throw new IllegalArgumentException("cost must be >= 0, got " + cost); + } + if ((docBase & 63) != 0) { + throw new IllegalArgumentException("docBase need to be a multiple of 64"); + } + this.bits = bits; + this.length = bits.length() + docBase; + this.cost = cost; + this.docBase = docBase; + } + + public FixedBitSet getBitSet() { Review comment: Can you add javadocs? ########## File path: lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java ########## @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util; + +import org.apache.lucene.search.DocIdSetIterator; + +/** + * A @{@link DocIdSetIterator} like {@link BitSetIterator} but has a doc base in onder to avoid Review comment: ```suggestion * A {@link DocIdSetIterator} like {@link BitSetIterator} but has a doc base in onder to avoid ``` ########## File path: lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java ########## @@ -85,10 +94,65 @@ static void writeDocIds(int[] docIds, int start, int count, DataOutput out) thro } } + private static boolean isSorted(int[] docIds, int start, int count) { + for (int i = 1; i < count; ++i) { + if (docIds[start + i - 1] > docIds[start + i]) { + return false; + } + } + return true; + } + + private static boolean isStrictlySorted(int[] docIds, int start, int count) { + assert isSorted(docIds, start, count); + for (int i = 1; i < count; ++i) { + if (docIds[start + i - 1] == docIds[start + i]) { + return false; + } + } + return true; + } + + private static void writeIdsAsBitSet(int[] docIds, int start, int count, DataOutput out) + throws IOException { + int min = docIds[start]; + int max = docIds[start + count - 1]; + + final int offsetWords = min >> 6; + final int offsetBits = offsetWords << 6; + final int totalWordCount = FixedBitSet.bits2words(max - offsetBits + 1); + long currentWord = 0; + int currentWordIndex = 0; + + out.writeVInt(offsetBits); Review comment: let's write offsetWords instead since it's smaller? ########## File path: lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java ########## @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util; + +import org.apache.lucene.search.DocIdSetIterator; + +/** + * A @{@link DocIdSetIterator} like {@link BitSetIterator} but has a doc base in onder to avoid + * storing previous 0s. + */ +public class DocBaseBitSetIterator extends DocIdSetIterator { + + private final FixedBitSet bits; + private final int length; + private final long cost; + private final int docBase; + private int doc = -1; + + public DocBaseBitSetIterator(FixedBitSet bits, long cost, int docBase) { + if (cost < 0) { + throw new IllegalArgumentException("cost must be >= 0, got " + cost); + } + if ((docBase & 63) != 0) { + throw new IllegalArgumentException("docBase need to be a multiple of 64"); + } + this.bits = bits; + this.length = bits.length() + docBase; + this.cost = cost; + this.docBase = docBase; + } + + public FixedBitSet getBitSet() { + return bits; + } + + @Override + public int docID() { + return doc; + } + + public int getDocBase() { Review comment: Can you add javadocs? ########## File path: lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java ########## @@ -18,23 +18,32 @@ import java.io.IOException; import org.apache.lucene.index.PointValues.IntersectVisitor; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.DocBaseBitSetIterator; +import org.apache.lucene.util.FixedBitSet; class DocIdsWriter { private DocIdsWriter() {} - static void writeDocIds(int[] docIds, int start, int count, DataOutput out) throws IOException { + static void writeDocIds( + int[] docIds, int start, int count, DataOutput out, boolean consistentValue) + throws IOException { + if (consistentValue Review comment: So we only check this boolean as a way to avoid running `isStrictlyOrdered` since fields with low cardinality are the most likely to trigger this optimization, is my understanding correct? I'm tempted to remove this check so that your new optimization would also kick in when the index is sorted by this field, or when index order matches a field's order. We already perform some heavier operations at the leaf level like sorting by value. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org