jpountz commented on a change in pull request #980: LUCENE-8920: Reduce the memory used by direct addressing of arcs URL: https://github.com/apache/lucene-solr/pull/980#discussion_r344391189
########## File path: lucene/core/src/java/org/apache/lucene/util/fst/FST.java ########## @@ -249,38 +261,127 @@ public T nextFinalOutput() { return nextFinalOutput; } - long nextArc() { + /** Address (into the byte[]) of the next arc - only for list of variable length arc. + * Or ord/address to the next node if label == {@link #END_LABEL}. */ + long nextArc() { return nextArc; } - /** Where the first arc in the array starts; only valid if - * bytesPerArc != 0 */ + /** Where we are in the array; only valid if bytesPerArc != 0. */ + public int arcIdx() { + return arcIdx; + } + + /** Node header flags. Only meaningful to check if the value is either + * {@link #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} + * (other value when bytesPerArc == 0). */ + public byte nodeFlags() { + return nodeFlags; + } + + /** Where the first arc in the array starts; only valid if bytesPerArc != 0 */ public long posArcsStart() { return posArcsStart; } - /** Non-zero if this arc is part of an array, which means all + /** Non-zero if this arc is part of a node with fixed length arcs, which means all * arcs for the node are encoded with a fixed number of bytes so - * that we can random access by index. We do when there are enough - * arcs leaving one node. It wastes some bytes but gives faster - * lookups. */ + * that we binary search or direct address. We do when there are enough + * arcs leaving one node. It wastes some bytes but gives faster lookups. */ public int bytesPerArc() { return bytesPerArc; } - /** Where we are in the array; only valid if bytesPerArc != 0, and the array has no holes. - * arcIdx = Integer.MIN_VALUE indicates that the arc is part of a direct array, addressed by - * label. - */ - public int arcIdx() { - return arcIdx; - } - - /** How many arc, if bytesPerArc == 0. Otherwise, the size of the arc array. If the array is - * direct, this may include holes. Otherwise it is also how many arcs are in the array */ + /** How many arcs; only valid if bytesPerArc != 0 (fixed length arcs). + * For a node designed for binary search this is the array size. + * For a node designed for direct addressing, this is the label range. */ public int numArcs() { return numArcs; } + + /** Table of bits of a direct addressing node. + * Only valid if nodeFlags == {@link #ARCS_FOR_DIRECT_ADDRESSING}; + * may be null otherwise. */ + BitTable bitTable() { + return bitTable; + } + + /** The table of bits of a direct addressing node created lazily. */ + BitTable getOrCreateBitTable() { + if (bitTable == null) { + bitTable = new BitTable(); + } + return bitTable; + } + + /** First label of a direct addressing node. + * Only valid if nodeFlags == {@link #ARCS_FOR_DIRECT_ADDRESSING}. */ + int firstLabel() { + return firstLabel; + } + + /** + * Reusable table of bits using an array of long internally. + */ + static class BitTable { + + private long[] bits; + private int numLongs; + + /** Sets the number of longs in the internal long array. + * Enlarges it if needed. Always clears the array. */ + BitTable setNumLongs(int numLongs) { + assert numLongs >= 0; + this.numLongs = numLongs; + if (bits == null || bits.length < numLongs) { + bits = new long[ArrayUtil.oversize(numLongs, Long.BYTES)]; + } else { + // Avoid range check in Arrays.fill(). Review comment: I don't understand this comment? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org