mikemccand commented on code in PR #12738:
URL: https://github.com/apache/lucene/pull/12738#discussion_r1381812079
##########
lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java:
##########
@@ -186,149 +218,228 @@ private long hash(FSTCompiler.UnCompiledNode<T> node) {
     return h;
   }

-  // hash code for a frozen node. this must precisely match the hash computation of an unfrozen
-  // node!
-  private long hash(long node) throws IOException {
-    final int PRIME = 31;
-
-    long h = 0;
-    fstCompiler.fst.readFirstRealTargetArc(node, scratchArc, in);
-    while (true) {
-      h = PRIME * h + scratchArc.label();
-      h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 32));
-      h = PRIME * h + scratchArc.output().hashCode();
-      h = PRIME * h + scratchArc.nextFinalOutput().hashCode();
-      if (scratchArc.isFinal()) {
-        h += 17;
-      }
-      if (scratchArc.isLast()) {
-        break;
-      }
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    return h;
-  }
-
-  /**
-   * Compares an unfrozen node (UnCompiledNode) with a frozen node at byte location address (long),
-   * returning true if they are equal.
-   */
-  private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
-    fstCompiler.fst.readFirstRealTargetArc(address, scratchArc, in);
-
-    // fail fast for a node with fixed length arcs
-    if (scratchArc.bytesPerArc() != 0) {
-      assert node.numArcs > 0;
-      // the frozen node uses fixed-with arc encoding (same number of bytes per arc), but may be
-      // sparse or dense
-      switch (scratchArc.nodeFlags()) {
-        case FST.ARCS_FOR_BINARY_SEARCH:
-          // sparse
-          if (node.numArcs != scratchArc.numArcs()) {
-            return false;
-          }
-          break;
-        case FST.ARCS_FOR_DIRECT_ADDRESSING:
-          // dense -- compare both the number of labels allocated in the array (some of which may
-          // not actually be arcs), and the number of arcs
-          if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs()
-              || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
-            return false;
-          }
-          break;
-        default:
-          throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags());
-      }
-    }
-
-    // compare arc by arc to see if there is a difference
-    for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
-      final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
-      if (arc.label != scratchArc.label()
-          || arc.output.equals(scratchArc.output()) == false
-          || ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target()
-          || arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false
-          || arc.isFinal != scratchArc.isFinal()) {
-        return false;
-      }
-
-      if (scratchArc.isLast()) {
-        if (arcUpto == node.numArcs - 1) {
-          return true;
-        } else {
-          return false;
-        }
-      }
-
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    // unfrozen node has fewer arcs than frozen node
-
-    return false;
-  }
-
   /** Inner class because it needs access to hash function and FST bytes. */
   private class PagedGrowableHash {
-    private PagedGrowableWriter entries;
+    // total bytes copied out of the FST byte[] into this hash table ByteBlockPool for suffix nodes
+    private long copiedBytes;
+    // storing the FST node address where the position is the masked hash of the node arcs
+    private PagedGrowableWriter fstNodeAddress;
+    // storing the local copiedNodes address in the same position as fstNodeAddress
+    // here we are effectively storing a Map<Long, Long> from the FST node address to copiedNodes
+    // address
+    private PagedGrowableWriter copiedNodeAddress;
     private long count;
     private long mask;
+    // storing the byte slice from the FST for nodes we added to the hash so that we don't need to
+    // look up from the FST itself, so the FST bytes can stream directly to disk as append-only
+    // writes.
+    // each node will be written subsequently
+    private final ByteBlockPool copiedNodes;
+    // the {@link FST.BytesReader} to read from copiedNodes. we use this when computing a frozen
+    // node hash
+    // or comparing if a frozen and unfrozen nodes are equal
+    private final ByteBlockPoolReverseBytesReader bytesReader;

     // 256K blocks, but note that the final block is sized only as needed so it won't use the full
     // block size when just a few elements were written to it
     private static final int BLOCK_SIZE_BYTES = 1 << 18;

     public PagedGrowableHash() {
-      entries = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT);
+      fstNodeAddress = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT);
+      copiedNodeAddress = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT);
       mask = 15;
+      copiedNodes = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
+      bytesReader = new ByteBlockPoolReverseBytesReader(copiedNodes);
     }

     public PagedGrowableHash(long lastNodeAddress, long size) {
-      entries =
+      fstNodeAddress =
           new PagedGrowableWriter(
               size, BLOCK_SIZE_BYTES, PackedInts.bitsRequired(lastNodeAddress), PackedInts.COMPACT);
+      copiedNodeAddress = new PagedGrowableWriter(size, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT);
       mask = size - 1;
       assert (mask & size) == 0 : "size must be a power-of-2; got size=" + size + " mask=" + mask;
+      copiedNodes = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
+      bytesReader = new ByteBlockPoolReverseBytesReader(copiedNodes);
     }

-    public long get(long index) {
-      return entries.get(index);
+    /**
+     * Get the copied bytes at the provided hash slot
+     *
+     * @param hashSlot the hash slot to read from
+     * @param length the number of bytes to read
+     * @return the copied byte array
+     */
+    public byte[] getBytes(long hashSlot, int length) {
+      long address = copiedNodeAddress.get(hashSlot);
+      assert address - length + 1 >= 0;
+      byte[] buf = new byte[length];
+      copiedNodes.readBytes(address - length + 1, buf, 0, length);
+      return buf;
     }

-    public void set(long index, long pointer) throws IOException {
-      entries.set(index, pointer);
+    /**
+     * Get the node address from the provided hash slot
+     *
+     * @param hashSlot the hash slot to read
+     * @return the node address
+     */
+    public long getNodeAddress(long hashSlot) {
+      return fstNodeAddress.get(hashSlot);
+    }
+
+    /**
+     * Set the node address and bytes from the provided hash slot
+     *
+     * @param hashSlot the hash slot to write to
+     * @param nodeAddress the node address
+     * @param bytes the node bytes to be copied
+     */
+    public void setNode(long hashSlot, long nodeAddress, byte[] bytes) {
+      fstNodeAddress.set(hashSlot, nodeAddress);
       count++;
+      copiedNodes.append(bytes);
+      copiedBytes += bytes.length;
Review Comment:
   > I could add that method in ByteBlockPool as well.

   +1 to add it LOL.
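The quoted comment doesn't show which ByteBlockPool method is meant, so the
following is only a guess, not the actual change: if it is a raw byte[]
overload of append (the new setNode calls copiedNodes.append(bytes) with a
plain array), a minimal sketch inside ByteBlockPool could simply delegate to
the existing BytesRef-based append:

    // Hypothetical sketch only -- assumes the method under discussion is a
    // raw byte[] append overload (needs org.apache.lucene.util.BytesRef).
    public void append(byte[] bytes) {
      // reuse the existing block-boundary handling in append(BytesRef)
      append(new BytesRef(bytes));
    }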
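For context on the removed hash(long): its comment insists it "must precisely
match the hash computation of an unfrozen node". The unfrozen-node
counterpart (unchanged by this hunk; sketched here from the surrounding
NodeHash code rather than quoted from the PR) walks the in-memory arcs with
the same PRIME, the same field order, and the same +17 bump for final arcs:

    private long hash(FSTCompiler.UnCompiledNode<T> node) {
      final int PRIME = 31;
      long h = 0;
      for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++) {
        final FSTCompiler.Arc<T> arc = node.arcs[arcIdx];
        h = PRIME * h + arc.label; // matches scratchArc.label()
        long n = ((FSTCompiler.CompiledNode) arc.target).node;
        h = PRIME * h + (int) (n ^ (n >> 32)); // matches the frozen target hash
        h = PRIME * h + arc.output.hashCode();
        h = PRIME * h + arc.nextFinalOutput.hashCode();
        if (arc.isFinal) {
          h += 17; // same constant as the frozen-node loop
        }
      }
      return h;
    }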
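On the new PagedGrowableHash fields: fstNodeAddress and copiedNodeAddress are
parallel arrays keyed by the masked node hash, so a single slot yields both
the node's address in the FST and the address of its copied bytes in the
ByteBlockPool. A rough caller-side sketch (nodeHash and nodeByteLength are
illustrative names, not from the PR; assuming 0 marks an empty slot, as in
the existing NodeHash lookup code):

    long hashSlot = nodeHash & mask;            // masked hash picks the slot
    long fstAddress = getNodeAddress(hashSlot); // fstNodeAddress lookup
    if (fstAddress != 0) {
      // the frozen node's bytes were copied into the ByteBlockPool when it
      // was added, so hashing and equality checks read the local copy and
      // never re-read the FST's own byte[], which can then stream to disk
      // as append-only writes
      byte[] nodeBytes = getBytes(hashSlot, nodeByteLength);
    }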