[GitHub] [lucene] iverase commented on a change in pull request #7: LUCENE-9820: Separate logic for reading the BKD index from logic to intersecting it

GitBox Mon, 11 Oct 2021 05:21:27 -0700


iverase commented on a change in pull request #7:
URL: https://github.com/apache/lucene/pull/7#discussion_r726066983




##########
File path: lucene/core/src/java/org/apache/lucene/util/bkd/BKDDefaultReader.java
##########
@@ -0,0 +1,923 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.bkd;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.MathUtil;
+
+/**
+ * Handles reading a block KD-tree previously written with {@link BKDWriter}.
+ *
+ * @lucene.experimental
+ */
+public class BKDDefaultReader implements BKDReader {
+
+  final BKDConfig config;
+  final int numLeaves;
+  // Packed array of byte[] holding all docs and values:
+  final IndexInput in;
+  final byte[] minPackedValue;
+  final byte[] maxPackedValue;
+  final long pointCount;
+  final int docCount;
+  final int version;
+  final long minLeafBlockFP;
+  // Packed array of byte[] holding all split values in the full binary tree:
+  private final IndexInput packedIndex;
+
+  /**
+   * Caller must pre-seek the provided {@link IndexInput} to the index location that {@link
+   * BKDWriter#finish} returned. BKD tree is always stored off-heap.
+   */
+  public BKDDefaultReader(IndexInput metaIn, IndexInput indexIn, IndexInput 
dataIn)
+      throws IOException {
+    version =
+        CodecUtil.checkHeader(
+            metaIn, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, 
BKDWriter.VERSION_CURRENT);
+    final int numDims = metaIn.readVInt();
+    final int numIndexDims;
+    if (version >= BKDWriter.VERSION_SELECTIVE_INDEXING) {
+      numIndexDims = metaIn.readVInt();
+    } else {
+      numIndexDims = numDims;
+    }
+    final int maxPointsInLeafNode = metaIn.readVInt();
+    final int bytesPerDim = metaIn.readVInt();
+    config = new BKDConfig(numDims, numIndexDims, bytesPerDim, 
maxPointsInLeafNode);
+
+    // Read index:
+    numLeaves = metaIn.readVInt();
+    assert numLeaves > 0;
+
+    minPackedValue = new byte[config.packedIndexBytesLength];
+    maxPackedValue = new byte[config.packedIndexBytesLength];
+
+    metaIn.readBytes(minPackedValue, 0, config.packedIndexBytesLength);
+    metaIn.readBytes(maxPackedValue, 0, config.packedIndexBytesLength);
+    final ArrayUtil.ByteArrayComparator comparator =
+        ArrayUtil.getUnsignedComparator(config.bytesPerDim);
+    for (int dim = 0; dim < config.numIndexDims; dim++) {
+      if (comparator.compare(
+              minPackedValue, dim * config.bytesPerDim, maxPackedValue, dim * 
config.bytesPerDim)
+          > 0) {
+        throw new CorruptIndexException(
+            "minPackedValue "
+                + new BytesRef(minPackedValue)
+                + " is > maxPackedValue "
+                + new BytesRef(maxPackedValue)
+                + " for dim="
+                + dim,
+            metaIn);
+      }
+    }
+
+    pointCount = metaIn.readVLong();
+    docCount = metaIn.readVInt();
+
+    int numIndexBytes = metaIn.readVInt();
+    long indexStartPointer;
+    if (version >= BKDWriter.VERSION_META_FILE) {
+      minLeafBlockFP = metaIn.readLong();
+      indexStartPointer = metaIn.readLong();
+    } else {
+      indexStartPointer = indexIn.getFilePointer();
+      minLeafBlockFP = indexIn.readVLong();
+      indexIn.seek(indexStartPointer);
+    }
+    this.packedIndex = indexIn.slice("packedIndex", indexStartPointer, 
numIndexBytes);
+    this.in = dataIn;
+  }
+
+  @Override
+  public BKDConfig getConfig() {
+    return config;
+  }
+
+  @Override
+  public byte[] getMinPackedValue() {
+    return minPackedValue.clone();
+  }
+
+  @Override
+  public byte[] getMaxPackedValue() {
+    return maxPackedValue.clone();
+  }
+
+  @Override
+  public long getPointCount() {
+    return pointCount;
+  }
+
+  @Override
+  public int getDocCount() {
+    return docCount;
+  }
+
+  @Override
+  public BKDReader.IndexTree getIndexTree() throws IOException {
+    return new IndexTree(
+        packedIndex.clone(),
+        this.in.clone(),
+        config,
+        numLeaves,
+        version,
+        pointCount,
+        minPackedValue,
+        maxPackedValue);
+  }
+
+  private static class IndexTree implements BKDReader.IndexTree {
+    private int nodeID;
+    // during clone, the node root can be different to 1
+    private final int nodeRoot;
+    // level is 1-based so that we can do level-1 w/o checking each time:
+    private int level;
+    // used to read the packed tree off-heap
+    private final IndexInput innerNodes;
+    // used to read the packed leaves off-heap
+    private final IndexInput leafNodes;
+    // holds the minimum (left most) leaf block file pointer for each level we've recursed to:
+    private final long[] leafBlockFPStack;
+    // holds the address, in the off-heap index, of the right-node of each level:
+    private final int[] rightNodePositions;
+    // holds the splitDim position for each level:
+    private final int[] splitDimsPos;
+    // true if the per-dim delta we read for the node at this level is a negative offset vs. the
+    // last split on this dim; this is a packed
+    // 2D array, i.e. to access array[level][dim] you read from
+    // negativeDeltas[level*numIndexDims+dim].
+    // this will be true if the last time we
+    // split on this dimension, we next pushed to the left sub-tree:
+    private final boolean[] negativeDeltas;
+    // holds the packed per-level split values
+    private final byte[][] splitValuesStack;
+    // holds the min / max value of the current node.
+    private final byte[] minPackedValue, maxPackedValue;
+    // holds the previous value of the split dimension
+    private final byte[][] splitDimValueStack;
+    // tree parameters
+    private final BKDConfig config;
+    // number of leaves
+    private final int leafNodeOffset;
+    // version of the index
+    private final int version;
+    // last node might not be fully populated
+    private final int lastLeafNodePointCount;
+    // right most leaf node ID
+    private final int rightMostLeafNode;
+    // helper objects for reading doc values
+    private final byte[] scratchDataPackedValue,
+        scratchMinIndexPackedValue,
+        scratchMaxIndexPackedValue;
+    private final int[] commonPrefixLengths;
+    private final BKDReaderDocIDSetIterator scratchIterator;
+
+    private IndexTree(
+        IndexInput innerNodes,
+        IndexInput leafNodes,
+        BKDConfig config,
+        int numLeaves,
+        int version,
+        long pointCount,
+        byte[] minPackedValue,
+        byte[] maxPackedValue)
+        throws IOException {
+      this(
+          innerNodes,
+          leafNodes,
+          config,
+          numLeaves,
+          version,
+          Math.toIntExact(pointCount % config.maxPointsInLeafNode),
+          1,
+          1,
+          minPackedValue,
+          maxPackedValue,
+          new BKDReaderDocIDSetIterator(config.maxPointsInLeafNode),
+          new byte[config.packedBytesLength],
+          new byte[config.packedIndexBytesLength],
+          new byte[config.packedIndexBytesLength],
+          new int[config.numDims]);
+      // read root node
+      readNodeData(false);
+    }
+
+    private IndexTree(
+        IndexInput innerNodes,
+        IndexInput leafNodes,
+        BKDConfig config,
+        int numLeaves,
+        int version,
+        int lastLeafNodePointCount,
+        int nodeID,
+        int level,
+        byte[] minPackedValue,
+        byte[] maxPackedValue,
+        BKDReaderDocIDSetIterator scratchIterator,
+        byte[] scratchDataPackedValue,
+        byte[] scratchMinIndexPackedValue,
+        byte[] scratchMaxIndexPackedValue,
+        int[] commonPrefixLengths) {
+      this.config = config;
+      this.version = version;
+      this.nodeID = nodeID;
+      this.nodeRoot = nodeID;
+      this.level = level;
+      leafNodeOffset = numLeaves;
+      this.innerNodes = innerNodes;
+      this.leafNodes = leafNodes;
+      this.minPackedValue = minPackedValue.clone();
+      this.maxPackedValue = maxPackedValue.clone();
+      // stack arrays that keep information at different levels
+      int treeDepth = getTreeDepth(numLeaves);
+      splitDimValueStack = new byte[treeDepth][];
+      splitValuesStack = new byte[treeDepth][];
+      splitValuesStack[0] = new byte[config.packedIndexBytesLength];
+      leafBlockFPStack = new long[treeDepth + 1];
+      rightNodePositions = new int[treeDepth];
+      splitDimsPos = new int[treeDepth];
+      negativeDeltas = new boolean[config.numIndexDims * treeDepth];
+      // information about the unbalance of the tree so we can report the exact size below a node
+      rightMostLeafNode = (1 << treeDepth - 1) - 1;
+      this.lastLeafNodePointCount =
+          lastLeafNodePointCount == 0 ? config.maxPointsInLeafNode : 
lastLeafNodePointCount;
+      // scratch objects, reused between clones so that NN searches do not create those objects
+      // in every clone.
+      this.scratchIterator = scratchIterator;
+      this.commonPrefixLengths = commonPrefixLengths;
+      this.scratchDataPackedValue = scratchDataPackedValue;
+      this.scratchMinIndexPackedValue = scratchMinIndexPackedValue;
+      this.scratchMaxIndexPackedValue = scratchMaxIndexPackedValue;
+    }
+
+    @Override
+    public BKDReader.IndexTree clone() {
+      BKDDefaultReader.IndexTree index =
+          new BKDDefaultReader.IndexTree(
+              innerNodes.clone(),
+              leafNodes.clone(),
+              config,
+              leafNodeOffset,
+              version,
+              lastLeafNodePointCount,
+              nodeID,
+              level,
+              minPackedValue,
+              maxPackedValue,
+              scratchIterator,
+              scratchDataPackedValue,
+              scratchMinIndexPackedValue,
+              scratchMaxIndexPackedValue,
+              commonPrefixLengths);
+      index.leafBlockFPStack[index.level] = leafBlockFPStack[level];
+      if (isLeafNode() == false) {
+        // copy node data
+        index.rightNodePositions[index.level] = rightNodePositions[level];
+        index.splitValuesStack[index.level] = splitValuesStack[level].clone();
+        System.arraycopy(
+            negativeDeltas,
+            level * config.numIndexDims,
+            index.negativeDeltas,
+            level * config.numIndexDims,
+            config.numIndexDims);
+        index.splitDimsPos[level] = splitDimsPos[level];
+      }
+      return index;
+    }
+
+    @Override
+    public byte[] getMinPackedValue() {
+      return minPackedValue;
+    }
+
+    @Override
+    public byte[] getMaxPackedValue() {
+      return maxPackedValue;
+    }
+
+    @Override
+    public boolean moveToChild() throws IOException {
+      if (isLeafNode()) {
+        return false;
+      }
+      pushBoundsLeft();
+      pushLeft();
+      return true;
+    }
+
+    private void pushBoundsLeft() {
+      final int splitDimPos = splitDimsPos[level];
+      if (splitDimValueStack[level] == null) {
+        splitDimValueStack[level] = new byte[config.bytesPerDim];
+      }
+      // save the dimension we are going to change
+      System.arraycopy(
+          maxPackedValue, splitDimPos, splitDimValueStack[level], 0, 
config.bytesPerDim);
+      assert ArrayUtil.getUnsignedComparator(config.bytesPerDim)
+                  .compare(maxPackedValue, splitDimPos, 
splitValuesStack[level], splitDimPos)
+              >= 0
+          : "config.bytesPerDim="
+              + config.bytesPerDim
+              + " splitDimPos="
+              + splitDimsPos[level]
+              + " config.numIndexDims="
+              + config.numIndexDims
+              + " config.numDims="
+              + config.numDims;
+      // add the split dim value:
+      System.arraycopy(
+          splitValuesStack[level], splitDimPos, maxPackedValue, splitDimPos, 
config.bytesPerDim);
+    }
+
+    private void pushLeft() throws IOException {
+      nodeID *= 2;
+      level++;
+      readNodeData(true);
+    }
+
+    private void pushBoundsRight() {
+      final int splitDimPos = splitDimsPos[level];
+      // we should have already visited the left node
+      assert splitDimValueStack[level] != null;
+      // save the dimension we are going to change
+      System.arraycopy(
+          minPackedValue, splitDimPos, splitDimValueStack[level], 0, 
config.bytesPerDim);
+      assert ArrayUtil.getUnsignedComparator(config.bytesPerDim)
+                  .compare(minPackedValue, splitDimPos, 
splitValuesStack[level], splitDimPos)
+              <= 0
+          : "config.bytesPerDim="
+              + config.bytesPerDim
+              + " splitDimPos="
+              + splitDimsPos[level]
+              + " config.numIndexDims="
+              + config.numIndexDims
+              + " config.numDims="
+              + config.numDims;
+      // add the split dim value:
+      System.arraycopy(
+          splitValuesStack[level], splitDimPos, minPackedValue, splitDimPos, 
config.bytesPerDim);
+    }
+
+    private void pushRight() throws IOException {
+      final int nodePosition = rightNodePositions[level];
+      assert nodePosition >= innerNodes.getFilePointer()
+          : "nodePosition = " + nodePosition + " < currentPosition=" + 
innerNodes.getFilePointer();
+      innerNodes.seek(nodePosition);
+      nodeID = 2 * nodeID + 1;
+      level++;
+      readNodeData(false);
+    }
+
+    @Override
+    public boolean moveToSibling() throws IOException {
+      if (isLeftNode() == false || isRootNode()) {
+        return false;
+      }
+      pop();
+      popBounds(maxPackedValue);
+      pushBoundsRight();
+      pushRight();
+      assert nodeExists();
+      return true;
+    }
+
+    private void pop() {
+      nodeID /= 2;
+      level--;
+    }
+
+    private void popBounds(byte[] packedValue) {
+      // restore the split dimension
+      System.arraycopy(
+          splitDimValueStack[level], 0, packedValue, splitDimsPos[level], 
config.bytesPerDim);
+    }
+
+    @Override
+    public boolean moveToParent() {
+      if (isRootNode()) {
+        return false;
+      }
+      final byte[] packedValue = isLeftNode() ? maxPackedValue : 
minPackedValue;
+      pop();
+      popBounds(packedValue);
+      return true;
+    }
+
+    private boolean isRootNode() {
+      return nodeID == nodeRoot;
+    }
+
+    private boolean isLeftNode() {
+      return (nodeID & 1) == 0;
+    }
+
+    private boolean isLeafNode() {
+      return nodeID >= leafNodeOffset;
+    }
+
+    private boolean nodeExists() {
+      return nodeID - leafNodeOffset < leafNodeOffset;
+    }
+
+    /** Only valid after pushLeft or pushRight, not pop! */
+    private long getLeafBlockFP() {
+      assert isLeafNode() : "nodeID=" + nodeID + " is not a leaf";
+      return leafBlockFPStack[level];
+    }
+
+    @Override
+    public long size() {
+      int leftMostLeafNode = nodeID;
+      while (leftMostLeafNode < leafNodeOffset) {
+        leftMostLeafNode = leftMostLeafNode * 2;
+      }
+      int rightMostLeafNode = nodeID;
+      while (rightMostLeafNode < leafNodeOffset) {
+        rightMostLeafNode = rightMostLeafNode * 2 + 1;
+      }
+      final int numLeaves;
+      if (rightMostLeafNode >= leftMostLeafNode) {
+        // both are on the same level
+        numLeaves = rightMostLeafNode - leftMostLeafNode + 1;
+      } else {
+        // left is one level deeper than right
+        numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
+      }
+      assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + 
getNumLeavesSlow(nodeID);
+      return rightMostLeafNode == this.rightMostLeafNode
+          ? (long) (numLeaves - 1) * config.maxPointsInLeafNode + 
lastLeafNodePointCount
+          : (long) numLeaves * config.maxPointsInLeafNode;
+    }
+
+    @Override
+    public void visitDocIDs(PointValues.IntersectVisitor visitor) throws 
IOException {
+      long maxPointCount = size();
+      while (maxPointCount > Integer.MAX_VALUE) {
+        // could be >MAX_VALUE if there are more than 2B points in total
+        visitor.grow(Integer.MAX_VALUE);
+        maxPointCount -= Integer.MAX_VALUE;
+      }

Review comment:
       Ouch, I see. I will modify it back so it looks like the previous 
implementation.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene] iverase commented on a change in pull request #7: LUCENE-9820: Separate logic for reading the BKD index from logic to intersecting it

Reply via email to