dungba88 commented on code in PR #12738: URL: https://github.com/apache/lucene/pull/12738#discussion_r1378882326
########## lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java: ########## @@ -186,132 +214,99 @@ private long hash(FSTCompiler.UnCompiledNode<T> node) { return h; } - // hash code for a frozen node. this must precisely match the hash computation of an unfrozen - // node! - private long hash(long node) throws IOException { - final int PRIME = 31; - - long h = 0; - fstCompiler.fst.readFirstRealTargetArc(node, scratchArc, in); - while (true) { - h = PRIME * h + scratchArc.label(); - h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 32)); - h = PRIME * h + scratchArc.output().hashCode(); - h = PRIME * h + scratchArc.nextFinalOutput().hashCode(); - if (scratchArc.isFinal()) { - h += 17; - } - if (scratchArc.isLast()) { - break; - } - fstCompiler.fst.readNextRealArc(scratchArc, in); - } - - return h; - } - - /** - * Compares an unfrozen node (UnCompiledNode) with a frozen node at byte location address (long), - * returning true if they are equal. - */ - private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException { - fstCompiler.fst.readFirstRealTargetArc(address, scratchArc, in); - - // fail fast for a node with fixed length arcs - if (scratchArc.bytesPerArc() != 0) { - assert node.numArcs > 0; - // the frozen node uses fixed-with arc encoding (same number of bytes per arc), but may be - // sparse or dense - switch (scratchArc.nodeFlags()) { - case FST.ARCS_FOR_BINARY_SEARCH: - // sparse - if (node.numArcs != scratchArc.numArcs()) { - return false; - } - break; - case FST.ARCS_FOR_DIRECT_ADDRESSING: - // dense -- compare both the number of labels allocated in the array (some of which may - // not actually be arcs), and the number of arcs - if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs() - || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) { - return false; - } - break; - default: - throw new AssertionError("unhandled scratchArc.nodeFlag() " + 
scratchArc.nodeFlags()); - } - } - - // compare arc by arc to see if there is a difference - for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) { - final FSTCompiler.Arc<T> arc = node.arcs[arcUpto]; - if (arc.label != scratchArc.label() - || arc.output.equals(scratchArc.output()) == false - || ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() - || arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false - || arc.isFinal != scratchArc.isFinal()) { - return false; - } - - if (scratchArc.isLast()) { - if (arcUpto == node.numArcs - 1) { - return true; - } else { - return false; - } - } - - fstCompiler.fst.readNextRealArc(scratchArc, in); - } - - // unfrozen node has fewer arcs than frozen node - - return false; - } - /** Inner class because it needs access to hash function and FST bytes. */ private class PagedGrowableHash { - private PagedGrowableWriter entries; + public long copiedBytes; + // storing the FST node address where the position is the masked hash of the node arcs Review Comment: > I was thinking we could do this instead with two PagedGrowableWriter, both of which being addressed by the hash(fstNodeAddress) % mask? It's a bit messy as that means we need to know the masked hash of the node when computing the hash. So I put that masked hash (computed using the unfrozen nodes) into the hash function. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org For additional commands, e-mail: issues-help@lucene.apache.org