dungba88 commented on code in PR #12633:
URL: https://github.com/apache/lucene/pull/12633#discussion_r1363098628


##########
lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java:
##########
@@ -17,79 +17,177 @@
 package org.apache.lucene.util.fst;
 
 import java.io.IOException;
-import org.apache.lucene.util.packed.PackedInts;
+import java.util.Arrays;
+import java.util.Locale;
 import org.apache.lucene.util.packed.PagedGrowableWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
+// nocommit we could gradually grow the double-barrel hash size based on size of the growing FST?  hmm does not work so well since you
+// failed to use the larger hash in the beginning?  it's fundamentally needing two passes?
+
+// TODO: any way to make a reverse suffix lookup (msokolov's idea) instead of more costly hash?  hmmm, though, hash is not so wasteful
+// since it does not have to store value of each entry: the value is the node pointer in the FST.  actually, there is much to save
+// there -- we would not need any long per entry -- we'd be able to start at the FST end node and work backwards from the transitions
+
+// TODO: couldn't we prune naturally back until we see a transition with an output?  it's highly unlikely (mostly impossible) such suffixes
+// can be shared?
 
 // Used to dedup states (lookup already-frozen states)
 final class NodeHash<T> {
 
-  private PagedGrowableWriter table;
-  private long count;
-  private long mask;
+  // nocommit
+  private static final boolean DO_PRINT_HASH_RAM = true;
+
+  // primary table -- we add nodes into this until it reaches the requested tableSizeLimit/2, then we move it to fallback
+  private PagedGrowableHash primaryTable;
+
+  // how many nodes we are allowed to store in both the primary and fallback tables; when primary gets full (tableSizeLimit/2), we move it to the
+  // fallback table
+  private final long ramLimitBytes;
+
+  // fallback table.  if we fallback and find the frozen node here, we promote it to primary table, for a simplistic and lowish-RAM-overhead
+  // (compared to e.g. LinkedHashMap) LRU behaviour
+  private PagedGrowableHash fallbackTable;
+
   private final FST<T> fst;
   private final FST.Arc<T> scratchArc = new FST.Arc<>();
   private final FST.BytesReader in;
 
-  public NodeHash(FST<T> fst, FST.BytesReader in) {
-    table = new PagedGrowableWriter(16, 1 << 27, 8, PackedInts.COMPACT);
-    mask = 15;
+  /** ramLimitMB is the max RAM we can use for recording suffixes. If we hit this limit, the least recently used suffixes are discarded, and
+   *  the FST is no longer minimal.  Still, larger ramLimitMB will make the FST smaller (closer to minimal). */
+  public NodeHash(FST<T> fst, double ramLimitMB, FST.BytesReader in) {
+    if (ramLimitMB <= 0) {
+      throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB);
+    }
+    double asBytes = ramLimitMB * 1024 * 1024;
+    if (asBytes >= Long.MAX_VALUE) {
+      // quietly truncate to Long.MAX_VALUE in bytes too
+      ramLimitBytes = Long.MAX_VALUE;
+    } else {
+      ramLimitBytes = (long) asBytes;
+    }
+    
+    primaryTable = new PagedGrowableHash();
     this.fst = fst;
     this.in = in;
   }
 
-  private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
-    fst.readFirstRealTargetArc(address, scratchArc, in);
-
-    // Fail fast for a node with fixed length arcs.
-    if (scratchArc.bytesPerArc() != 0) {
-      if (scratchArc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
-        if (node.numArcs != scratchArc.numArcs()) {
-          return false;
-        }
-      } else {
-        assert scratchArc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
-        if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs()
-            || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
-          return false;
-        }
+  // nocommit measure how wasteful/conflicty these hash tables are.  should we improve hash function?
+  
+  private long getFallback(FSTCompiler.UnCompiledNode<T> nodeIn, long hash) throws IOException {
+    if (fallbackTable == null) {
+      // no fallback yet (primary table is not yet large enough to swap)
+      return 0;
+    }
+    long pos = hash & fallbackTable.mask;
+    int c = 0;
+    while (true) {
+      long node = fallbackTable.get(pos);
+      if (node == 0) {
+        // not found
+        return 0;
+      } else if (nodesEqual(nodeIn, node)) {
+        // frozen version of this node is already here
+        return node;
       }
+
+      // nocommit -- is this really quadratic probe?
+      // quadratic probe
+      pos = (pos + (++c)) & fallbackTable.mask;
     }
+  }
 
-    for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
-      final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
-      if (arc.label != scratchArc.label()
-          || !arc.output.equals(scratchArc.output())
-          || ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target()
-          || !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput())
-          || arc.isFinal != scratchArc.isFinal()) {
-        return false;
-      }
+  public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn)
+      throws IOException {
 
-      if (scratchArc.isLast()) {
-        if (arcUpto == node.numArcs - 1) {
-          return true;
+    long hash = hash(nodeIn);
+    
+    long pos = hash & primaryTable.mask;
+    int c = 0;
+
+    while (true) {
+
+      long node = primaryTable.get(pos);
+      if (node == 0) {
+        // node is not in primary table; is it in fallback table?
+        node = getFallback(nodeIn, hash);
+        if (node != 0) {
+          // it was already in fallback -- promote to primary
+          primaryTable.set(pos, node);
         } else {
-          return false;
+          // not in fallback either -- freeze & add the incoming node
+
+          node = fst.addNode(fstCompiler, nodeIn);
+
+          // we use 0 as empty marker in hash table, so it better be impossible to get a frozen node at 0:
+          assert node != 0;
+
+          // confirm frozen hash and unfrozen hash are the same
+          assert hash(node) == hash : "mismatch frozenHash=" + hash(node) + " vs hash=" + hash;

Review Comment:
   Not necessarily related to this CR
   
   This seems to be only for the assertion, but `hash(long)` requires reading the FST and is thus one obstacle to decoupling NodeHash from the FST. I'm wondering if it would make sense to let `fst.addNode` (it should be `fstCompiler.addNode()` now) return the node value instead of the node address.
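   
   Roughly the shape that suggestion implies, as a hypothetical sketch (both names below are made up, not existing Lucene API; `N` stands in for `FSTCompiler.UnCompiledNode<T>`):
   
   ```java
   // Hypothetical sketch: freezing returns the node's value (hash) together
   // with its address, so NodeHash can assert `frozen.value() == hash`
   // without re-reading the FST via hash(long).
   record FrozenNode(long address, long value) {}
   
   interface NodeFreezer<N> {
     FrozenNode addNode(N nodeIn) throws java.io.IOException;
   }
   ```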
   
   The 2nd and final obstacle is `nodesEqual`. It seems it would require the cache table to store the node value & address instead of just the address. Maybe we could build an LRU cache from nodeAddress to nodeValue? Storing values would mean more heap for small FSTs though.
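   
   To make that LRU idea concrete, here is a minimal, hypothetical sketch of a cache from nodeAddress to nodeValue built on plain `LinkedHashMap` (the class name `NodeValueCache` is made up, and as the PR's own comments note, `LinkedHashMap` carries more RAM overhead than the double-barrel tables, so this only shows the shape of the idea):
   
   ```java
   import java.util.LinkedHashMap;
   import java.util.Map;
   
   // Hypothetical sketch, not existing Lucene API: maps a frozen node's
   // address to its value (hash), evicting the least recently used entry
   // once the cache grows past maxEntries.
   final class NodeValueCache {
     private final LinkedHashMap<Long, Long> cache;
   
     NodeValueCache(int maxEntries) {
       // accessOrder=true keeps iteration order least-recently-accessed
       // first, which is what drives the LRU eviction below
       this.cache = new LinkedHashMap<Long, Long>(16, 0.75f, true) {
         @Override
         protected boolean removeEldestEntry(Map.Entry<Long, Long> eldest) {
           return size() > maxEntries;
         }
       };
     }
   
     // returns null if the address was never cached or was evicted
     Long getValue(long nodeAddress) {
       return cache.get(nodeAddress);
     }
   
     void put(long nodeAddress, long nodeValue) {
       cache.put(nodeAddress, nodeValue);
     }
   }
   ```
   
   Each entry boxes two longs plus a `LinkedHashMap.Entry`, which is exactly the "more heap" concern above, and why the PR's `PagedGrowableHash` approach is so much leaner.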


