mikemccand commented on code in PR #12738:
URL: https://github.com/apache/lucene/pull/12738#discussion_r1378650643

##########
lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java:
##########
@@ -186,119 +197,85 @@ private long hash(FSTCompiler.UnCompiledNode<T> node) {
     return h;
   }
 
-  // hash code for a frozen node.  this must precisely match the hash 
computation of an unfrozen
-  // node!
-  private long hash(long node) throws IOException {
-    final int PRIME = 31;
-
-    long h = 0;
-    fstCompiler.fst.readFirstRealTargetArc(node, scratchArc, in);
-    while (true) {
-      h = PRIME * h + scratchArc.label();
-      h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 
32));
-      h = PRIME * h + scratchArc.output().hashCode();
-      h = PRIME * h + scratchArc.nextFinalOutput().hashCode();
-      if (scratchArc.isFinal()) {
-        h += 17;
-      }
-      if (scratchArc.isLast()) {
-        break;
-      }
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    return h;
-  }
-
-  /**
-   * Compares an unfrozen node (UnCompiledNode) with a frozen node at byte 
location address (long),
-   * returning true if they are equal.
-   */
-  private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) 
throws IOException {
-    fstCompiler.fst.readFirstRealTargetArc(address, scratchArc, in);
-
-    // fail fast for a node with fixed length arcs
-    if (scratchArc.bytesPerArc() != 0) {
-      assert node.numArcs > 0;
-      // the frozen node uses fixed-with arc encoding (same number of bytes 
per arc), but may be
-      // sparse or dense
-      switch (scratchArc.nodeFlags()) {
-        case FST.ARCS_FOR_BINARY_SEARCH:
-          // sparse
-          if (node.numArcs != scratchArc.numArcs()) {
-            return false;
-          }
-          break;
-        case FST.ARCS_FOR_DIRECT_ADDRESSING:
-          // dense -- compare both the number of labels allocated in the array 
(some of which may
-          // not actually be arcs), and the number of arcs
-          if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != 
scratchArc.numArcs()
-              || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
-            return false;
-          }
-          break;
-        default:
-          throw new AssertionError("unhandled scratchArc.nodeFlag() " + 
scratchArc.nodeFlags());
-      }
-    }
-
-    // compare arc by arc to see if there is a difference
-    for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
-      final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
-      if (arc.label != scratchArc.label()
-          || arc.output.equals(scratchArc.output()) == false
-          || ((FSTCompiler.CompiledNode) arc.target).node != 
scratchArc.target()
-          || arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false
-          || arc.isFinal != scratchArc.isFinal()) {
-        return false;
-      }
-
-      if (scratchArc.isLast()) {
-        if (arcUpto == node.numArcs - 1) {
-          return true;
-        } else {
-          return false;
-        }
-      }
-
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    // unfrozen node has fewer arcs than frozen node
-
-    return false;
-  }
-
   /** Inner class because it needs access to hash function and FST bytes. */
   private class PagedGrowableHash {
+    public long copiedBytes;
     private PagedGrowableWriter entries;
+    // mapping from FST real address to copiedNodes offsets
+    private PagedGrowableWriter copiedOffsets;
     private long count;
+    // mask for entries
     private long mask;
+    // mask for copiedOffsets
+    private long offsetMask;
+    private final List<byte[]> copiedNodes;
 
     // 256K blocks, but note that the final block is sized only as needed so 
it won't use the full
     // block size when just a few elements were written to it
     private static final int BLOCK_SIZE_BYTES = 1 << 18;
 
     public PagedGrowableHash() {
       entries = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, 
PackedInts.COMPACT);
+      copiedOffsets = new PagedGrowableWriter(32, BLOCK_SIZE_BYTES, 8, 
PackedInts.COMPACT);
       mask = 15;
+      offsetMask = 31;
+      copiedNodes = new ArrayList<>();
     }
 
     public PagedGrowableHash(long lastNodeAddress, long size) {
       entries =
           new PagedGrowableWriter(
               size, BLOCK_SIZE_BYTES, 
PackedInts.bitsRequired(lastNodeAddress), PackedInts.COMPACT);
+      copiedOffsets =
+          new PagedGrowableWriter(
+              size * 2,
+              BLOCK_SIZE_BYTES,
+              PackedInts.bitsRequired(lastNodeAddress),
+              PackedInts.COMPACT);
       mask = size - 1;
+      offsetMask = size * 2 - 1;

Review Comment:
   Hmm -- why are `entries` and `copiedOffsets` not simply side-by-side value 
arrays for the hash map?  Why is `copiedOffsets` 2X the size?  It seems like we 
could have them precisely match (side-by-side value arrays for the same hash 
entry)?



##########
lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java:
##########
@@ -186,119 +197,85 @@ private long hash(FSTCompiler.UnCompiledNode<T> node) {
     return h;
   }
 
-  // hash code for a frozen node.  this must precisely match the hash 
computation of an unfrozen
-  // node!
-  private long hash(long node) throws IOException {
-    final int PRIME = 31;
-
-    long h = 0;
-    fstCompiler.fst.readFirstRealTargetArc(node, scratchArc, in);
-    while (true) {
-      h = PRIME * h + scratchArc.label();
-      h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 
32));
-      h = PRIME * h + scratchArc.output().hashCode();
-      h = PRIME * h + scratchArc.nextFinalOutput().hashCode();
-      if (scratchArc.isFinal()) {
-        h += 17;
-      }
-      if (scratchArc.isLast()) {
-        break;
-      }
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    return h;
-  }
-
-  /**
-   * Compares an unfrozen node (UnCompiledNode) with a frozen node at byte 
location address (long),
-   * returning true if they are equal.
-   */
-  private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) 
throws IOException {
-    fstCompiler.fst.readFirstRealTargetArc(address, scratchArc, in);
-
-    // fail fast for a node with fixed length arcs
-    if (scratchArc.bytesPerArc() != 0) {
-      assert node.numArcs > 0;
-      // the frozen node uses fixed-with arc encoding (same number of bytes 
per arc), but may be
-      // sparse or dense
-      switch (scratchArc.nodeFlags()) {
-        case FST.ARCS_FOR_BINARY_SEARCH:
-          // sparse
-          if (node.numArcs != scratchArc.numArcs()) {
-            return false;
-          }
-          break;
-        case FST.ARCS_FOR_DIRECT_ADDRESSING:
-          // dense -- compare both the number of labels allocated in the array 
(some of which may
-          // not actually be arcs), and the number of arcs
-          if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != 
scratchArc.numArcs()
-              || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
-            return false;
-          }
-          break;
-        default:
-          throw new AssertionError("unhandled scratchArc.nodeFlag() " + 
scratchArc.nodeFlags());
-      }
-    }
-
-    // compare arc by arc to see if there is a difference
-    for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
-      final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
-      if (arc.label != scratchArc.label()
-          || arc.output.equals(scratchArc.output()) == false
-          || ((FSTCompiler.CompiledNode) arc.target).node != 
scratchArc.target()
-          || arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false
-          || arc.isFinal != scratchArc.isFinal()) {
-        return false;
-      }
-
-      if (scratchArc.isLast()) {
-        if (arcUpto == node.numArcs - 1) {
-          return true;
-        } else {
-          return false;
-        }
-      }
-
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    // unfrozen node has fewer arcs than frozen node
-
-    return false;
-  }
-
   /** Inner class because it needs access to hash function and FST bytes. */
   private class PagedGrowableHash {
+    public long copiedBytes;
     private PagedGrowableWriter entries;
+    // mapping from FST real address to copiedNodes offsets
+    private PagedGrowableWriter copiedOffsets;
     private long count;
+    // mask for entries
     private long mask;
+    // mask for copiedOffsets
+    private long offsetMask;
+    private final List<byte[]> copiedNodes;
 
     // 256K blocks, but note that the final block is sized only as needed so 
it won't use the full
     // block size when just a few elements were written to it
     private static final int BLOCK_SIZE_BYTES = 1 << 18;
 
     public PagedGrowableHash() {
       entries = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, 
PackedInts.COMPACT);
+      copiedOffsets = new PagedGrowableWriter(32, BLOCK_SIZE_BYTES, 8, 
PackedInts.COMPACT);
       mask = 15;
+      offsetMask = 31;
+      copiedNodes = new ArrayList<>();
     }
 
     public PagedGrowableHash(long lastNodeAddress, long size) {
       entries =
           new PagedGrowableWriter(
               size, BLOCK_SIZE_BYTES, 
PackedInts.bitsRequired(lastNodeAddress), PackedInts.COMPACT);
+      copiedOffsets =
+          new PagedGrowableWriter(
+              size * 2,
+              BLOCK_SIZE_BYTES,
+              PackedInts.bitsRequired(lastNodeAddress),
+              PackedInts.COMPACT);
       mask = size - 1;
+      offsetMask = size * 2 - 1;
       assert (mask & size) == 0 : "size must be a power-of-2; got size=" + 
size + " mask=" + mask;
+      copiedNodes = new ArrayList<>();
+    }
+
+    public byte[] getBytes(long node) {
+      long pos = offsetHash(node) & offsetMask;
+      while (true) {
+        long value = copiedOffsets.get(pos);
+        assert value != 0;
+        if (value == node) {
+          return copiedNodes.get(Math.toIntExact(copiedOffsets.get(pos + 1)));
+        }
+        pos = (pos + 2) & offsetMask;
+      }
     }
 
     public long get(long index) {
       return entries.get(index);
     }
 
-    public void set(long index, long pointer) throws IOException {
+    public void set(long index, long pointer, byte[] bytes) {
       entries.set(index, pointer);
+      copiedNodes.add(bytes);
+      setOffset(pointer);
       count++;
+      copiedBytes += bytes.length;
+    }
+
+    private void setOffset(long pointer) {
+      long pos = offsetHash(pointer) & offsetMask;
+      // find an empty slot
+      while (copiedOffsets.get(pos) != 0) {
+        pos = (pos + 2) & offsetMask;
+      }
+      copiedOffsets.set(pos, pointer); // write the address
+      copiedOffsets.set(pos + 1, copiedNodes.size() - 1); // write the offsets

Review Comment:
   Oh!  I see.  Could we instead use side-by-side value arrays, instead of 
packing 2X into one?  We'll get better compression that way, since the scope of 
the two sets of values is very different.



##########
lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java:
##########
@@ -186,119 +193,85 @@ private long hash(FSTCompiler.UnCompiledNode<T> node) {
     return h;
   }
 
-  // hash code for a frozen node.  this must precisely match the hash 
computation of an unfrozen
-  // node!
-  private long hash(long node) throws IOException {
-    final int PRIME = 31;
-
-    long h = 0;
-    fstCompiler.fst.readFirstRealTargetArc(node, scratchArc, in);
-    while (true) {
-      h = PRIME * h + scratchArc.label();
-      h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 
32));
-      h = PRIME * h + scratchArc.output().hashCode();
-      h = PRIME * h + scratchArc.nextFinalOutput().hashCode();
-      if (scratchArc.isFinal()) {
-        h += 17;
-      }
-      if (scratchArc.isLast()) {
-        break;
-      }
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    return h;
-  }
-
-  /**
-   * Compares an unfrozen node (UnCompiledNode) with a frozen node at byte 
location address (long),
-   * returning true if they are equal.
-   */
-  private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) 
throws IOException {
-    fstCompiler.fst.readFirstRealTargetArc(address, scratchArc, in);
-
-    // fail fast for a node with fixed length arcs
-    if (scratchArc.bytesPerArc() != 0) {
-      assert node.numArcs > 0;
-      // the frozen node uses fixed-with arc encoding (same number of bytes 
per arc), but may be
-      // sparse or dense
-      switch (scratchArc.nodeFlags()) {
-        case FST.ARCS_FOR_BINARY_SEARCH:
-          // sparse
-          if (node.numArcs != scratchArc.numArcs()) {
-            return false;
-          }
-          break;
-        case FST.ARCS_FOR_DIRECT_ADDRESSING:
-          // dense -- compare both the number of labels allocated in the array 
(some of which may
-          // not actually be arcs), and the number of arcs
-          if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != 
scratchArc.numArcs()
-              || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
-            return false;
-          }
-          break;
-        default:
-          throw new AssertionError("unhandled scratchArc.nodeFlag() " + 
scratchArc.nodeFlags());
-      }
-    }
-
-    // compare arc by arc to see if there is a difference
-    for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
-      final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
-      if (arc.label != scratchArc.label()
-          || arc.output.equals(scratchArc.output()) == false
-          || ((FSTCompiler.CompiledNode) arc.target).node != 
scratchArc.target()
-          || arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false
-          || arc.isFinal != scratchArc.isFinal()) {
-        return false;
-      }
-
-      if (scratchArc.isLast()) {
-        if (arcUpto == node.numArcs - 1) {
-          return true;
-        } else {
-          return false;
-        }
-      }
-
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    // unfrozen node has fewer arcs than frozen node
-
-    return false;
-  }
-
   /** Inner class because it needs access to hash function and FST bytes. */
   private class PagedGrowableHash {
+    public long copiedBytes;
     private PagedGrowableWriter entries;
+    // mapping from FST real address to copiedNodes offsets
+    private PagedGrowableWriter copiedOffsets;
     private long count;
+    // mask for entries
     private long mask;
+    // mask for copiedOffsets
+    private long offsetMask;
+    private final List<byte[]> copiedNodes;

Review Comment:
   > * When we call `append(BytesRef)`, the byte[] will be copied into the 
`ByteBlockPool`. We already had 1 copy when (1) reading from the BytesStore on 
adding node and (2) reading from the fallback table on promotion.
   
   Couldn't we copy directly from FST's byte store into the underlying 
`ByteBlockPool` used by the hash once we (freeze) `addNode`?
   
   > Similarly calling get(BytesRef, int) would also require a copy from the 
ByteBlockPool into a temporary byte[]
   
   I'm not sure why that entails a copy -- seems like you can read the slice 
`byte[]` from the pool to compute `nodeEquals`.  If it is equal, great, you 
have the `offset` to look up, or, you are in fallback and you promote (pool to 
pool copy).  So there is one added copy in that case, but it is well amortized 
since it leads to smaller FST.
   
   Net/net I'm worried more about the added GC/RAM cost of `List<byte[]>` and 
less so about the added cost of copying.  If we must use a temporary `byte[]` 
when copy pool to pool, for simplicity, I think that's fine on first go: we can 
optimize / reduce these copies later.



##########
lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java:
##########
@@ -186,119 +194,85 @@ private long hash(FSTCompiler.UnCompiledNode<T> node) {
     return h;
   }
 
-  // hash code for a frozen node.  this must precisely match the hash 
computation of an unfrozen

Review Comment:
   Table dependent because each must consult its own `byte[]` store, right?  I 
think this makes sense.



##########
lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java:
##########
@@ -186,119 +194,85 @@ private long hash(FSTCompiler.UnCompiledNode<T> node) {
     return h;
   }
 
-  // hash code for a frozen node.  this must precisely match the hash 
computation of an unfrozen
-  // node!
-  private long hash(long node) throws IOException {
-    final int PRIME = 31;
-
-    long h = 0;
-    fstCompiler.fst.readFirstRealTargetArc(node, scratchArc, in);
-    while (true) {
-      h = PRIME * h + scratchArc.label();
-      h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 
32));
-      h = PRIME * h + scratchArc.output().hashCode();
-      h = PRIME * h + scratchArc.nextFinalOutput().hashCode();
-      if (scratchArc.isFinal()) {
-        h += 17;
-      }
-      if (scratchArc.isLast()) {
-        break;
-      }
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    return h;
-  }
-
-  /**
-   * Compares an unfrozen node (UnCompiledNode) with a frozen node at byte 
location address (long),
-   * returning true if they are equal.
-   */
-  private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) 
throws IOException {
-    fstCompiler.fst.readFirstRealTargetArc(address, scratchArc, in);
-
-    // fail fast for a node with fixed length arcs
-    if (scratchArc.bytesPerArc() != 0) {
-      assert node.numArcs > 0;
-      // the frozen node uses fixed-with arc encoding (same number of bytes 
per arc), but may be
-      // sparse or dense
-      switch (scratchArc.nodeFlags()) {
-        case FST.ARCS_FOR_BINARY_SEARCH:
-          // sparse
-          if (node.numArcs != scratchArc.numArcs()) {
-            return false;
-          }
-          break;
-        case FST.ARCS_FOR_DIRECT_ADDRESSING:
-          // dense -- compare both the number of labels allocated in the array 
(some of which may
-          // not actually be arcs), and the number of arcs
-          if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != 
scratchArc.numArcs()
-              || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
-            return false;
-          }
-          break;
-        default:
-          throw new AssertionError("unhandled scratchArc.nodeFlag() " + 
scratchArc.nodeFlags());
-      }
-    }
-
-    // compare arc by arc to see if there is a difference
-    for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
-      final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
-      if (arc.label != scratchArc.label()
-          || arc.output.equals(scratchArc.output()) == false
-          || ((FSTCompiler.CompiledNode) arc.target).node != 
scratchArc.target()
-          || arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false
-          || arc.isFinal != scratchArc.isFinal()) {
-        return false;
-      }
-
-      if (scratchArc.isLast()) {
-        if (arcUpto == node.numArcs - 1) {
-          return true;
-        } else {
-          return false;
-        }
-      }
-
-      fstCompiler.fst.readNextRealArc(scratchArc, in);
-    }
-
-    // unfrozen node has fewer arcs than frozen node
-
-    return false;
-  }
-
   /** Inner class because it needs access to hash function and FST bytes. */
   private class PagedGrowableHash {
+    public long copiedBytes;
     private PagedGrowableWriter entries;
+    // mapping from FST real address to copiedNodes offsets
+    private PagedGrowableWriter copiedOffsets;
     private long count;
+    // mask for entries
     private long mask;
+    // mask for copiedOffsets
+    private long offsetMask;
+    private final List<byte[]> copiedNodes;
 
     // 256K blocks, but note that the final block is sized only as needed so 
it won't use the full
     // block size when just a few elements were written to it
     private static final int BLOCK_SIZE_BYTES = 1 << 18;
 
     public PagedGrowableHash() {
       entries = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, 
PackedInts.COMPACT);
+      copiedOffsets = new PagedGrowableWriter(32, BLOCK_SIZE_BYTES, 8, 
PackedInts.COMPACT);

Review Comment:
   Hmm, I see: we need two long values stored per entry?  One for the true FST 
appending `byte[]` offset, and another for the local pool'd copy of just the 
`byte[]` for this node?  Maybe rename `entries` to `fstNodeAddress` and then 
`copiedNodeAddress` for the pool'd offsets?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to