[GitHub] [lucene-solr] mikemccand commented on a change in pull request #1573: Cleanup TermsHashPerField

GitBox Sat, 13 Jun 2020 11:09:33 -0700


mikemccand commented on a change in pull request #1573:
URL: https://github.com/apache/lucene-solr/pull/1573#discussion_r439751347




##########
File path: 
lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
##########
@@ -207,8 +202,6 @@ public void newPostingsArray() {
 
   @Override
   ParallelPostingsArray createPostingsArray(int size) {
-    IndexOptions indexOptions = fieldInfo.getIndexOptions();
-    assert indexOptions != IndexOptions.NONE;

Review comment:
       Hmm why not keep this assertion (to confirm that if the field is not 
somehow indexed we are not accidentally/incorrectly running this code)?

##########
File path: 
lucene/core/src/java/org/apache/lucene/index/ParallelPostingsArray.java
##########
@@ -22,14 +22,14 @@
   final static int BYTES_PER_POSTING = 3 * Integer.BYTES;
 
   final int size;
-  final int[] textStarts;
-  final int[] intStarts;
-  final int[] byteStarts;
+  final int[] textStarts; // maps term ID to the terms text start in the 
bytesHash
+  final int[] addressOffset; // maps term ID to current stream address

Review comment:
       +1 for this renaming!

##########
File path: lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
##########
@@ -19,203 +19,207 @@
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash.BytesStartArray;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.IntBlockPool;
 
+/**
+ * This class allows to store streams of information per term without knowing
+ * the size of the stream ahead of time. Each stream typically encodes one 
level
+ * of information like term frequency per document or term proximity. 
Internally
+ * this class allocates a linked list of slices that can be read by a {@link 
ByteSliceReader}
+ * for each term. Terms are first deduplicated in a {@link BytesRefHash} once 
this is done
+ * internal data-structures point to the current offset of each stream that 
can be written to.
+ */
 abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
   private static final int HASH_INIT_SIZE = 4;
 
-  final TermsHash termsHash;
-
-  final TermsHashPerField nextPerField;
-  protected final DocumentsWriterPerThread.DocState docState;
-  protected final FieldInvertState fieldState;
-  TermToBytesRefAttribute termAtt;
-  protected TermFrequencyAttribute termFreqAtt;
-
-  // Copied from our perThread
-  final IntBlockPool intPool;
+  private final TermsHashPerField nextPerField;
+  private final IntBlockPool intPool;
   final ByteBlockPool bytePool;
-  final ByteBlockPool termBytePool;
-
-  final int streamCount;
-  final int numPostingInt;
-
-  protected final FieldInfo fieldInfo;
-
-  final BytesRefHash bytesHash;
+  // for each term we store an integer per stream that points into the 
bytePool above
+  // the address is updated once data is written to the stream to point to the 
next free offset
+  // this the terms stream. The start address for the stream is stored in 
postingsArray.byteStarts[termId]
+  // This is initialized in the #addTerm method, either to a brand new per 
term stream if the term is new or
+  // to the addresses where the term stream was written to when we saw it the 
last time.
+  private int[] termStreamAddressBuffer;
+  private int streamAddressOffset;
+  private final int streamCount;
+  private final String fieldName;
+  final IndexOptions indexOptions;
+  /* This stores the actual term bytes for postings and offsets into the 
parent hash in the case that this
+  * TermsHashPerField is hashing term vectors.*/
+  private final BytesRefHash bytesHash;
 
   ParallelPostingsArray postingsArray;
-  private final Counter bytesUsed;
+  private int lastDocID; // only with assert
 
   /** streamCount: how many streams this field stores per term.
    * E.g. doc(+freq) is 1 stream, prox+offset is a second. */
-
-  public TermsHashPerField(int streamCount, FieldInvertState fieldState, 
TermsHash termsHash, TermsHashPerField nextPerField, FieldInfo fieldInfo) {
-    intPool = termsHash.intPool;
-    bytePool = termsHash.bytePool;
-    termBytePool = termsHash.termBytePool;
-    docState = termsHash.docState;
-    this.termsHash = termsHash;
-    bytesUsed = termsHash.bytesUsed;
-    this.fieldState = fieldState;
+  TermsHashPerField(int streamCount, IntBlockPool intPool, ByteBlockPool 
bytePool, ByteBlockPool termBytePool,
+                    Counter bytesUsed, TermsHashPerField nextPerField, String 
fieldName, IndexOptions indexOptions) {
+    this.intPool = intPool;
+    this.bytePool = bytePool;
     this.streamCount = streamCount;
-    numPostingInt = 2*streamCount;
-    this.fieldInfo = fieldInfo;
+    this.fieldName = fieldName;
     this.nextPerField = nextPerField;
+    assert indexOptions != IndexOptions.NONE;
+    this.indexOptions = indexOptions;
     PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, 
bytesUsed);
     bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE, byteStarts);
   }
 
   void reset() {
     bytesHash.clear(false);
+    sortedTermIDs = null;
     if (nextPerField != null) {
       nextPerField.reset();
     }
   }
 
-  public void initReader(ByteSliceReader reader, int termID, int stream) {
+  final void initReader(ByteSliceReader reader, int termID, int stream) {
     assert stream < streamCount;
-    int intStart = postingsArray.intStarts[termID];
-    final int[] ints = intPool.buffers[intStart >> 
IntBlockPool.INT_BLOCK_SHIFT];
-    final int upto = intStart & IntBlockPool.INT_BLOCK_MASK;
+    int streamStartOffset = postingsArray.addressOffset[termID];
+    final int[] streamAddressBuffer = intPool.buffers[streamStartOffset >> 
IntBlockPool.INT_BLOCK_SHIFT];
+    final int offsetInAddressBuffer = streamStartOffset & 
IntBlockPool.INT_BLOCK_MASK;
     reader.init(bytePool,
                 
postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
-                ints[upto+stream]);
+                streamAddressBuffer[offsetInAddressBuffer+stream]);
   }
 
-  int[] sortedTermIDs;
+  private int[] sortedTermIDs;
 
   /** Collapse the hash table and sort in-place; also sets
-   * this.sortedTermIDs to the results */
-  public int[] sortPostings() {
+   * this.sortedTermIDs to the results
+   * This method should not be called twice unless {@link #reset()}
+   * or {@link #reinitHash()} was called. */
+  final void sortTerms() {
+    assert sortedTermIDs == null;
     sortedTermIDs = bytesHash.sort();
+  }
+
+  /**
+   * Returns the sorted term IDs. {@link #sortTerms()} must be called before
+   */
+  final int[] getSortedTermIDs() {
+    assert sortedTermIDs != null;
     return sortedTermIDs;
   }
 
+  final void reinitHash() {
+    sortedTermIDs = null;
+    bytesHash.reinit();
+  }
+
   private boolean doNextCall;
 
   // Secondary entry point (for 2nd & subsequent TermsHash),
   // because token text has already been "interned" into
   // textStart, so we hash by textStart.  term vectors use
   // this API.
-  public void add(int textStart) throws IOException {
+  private void add(int textStart, final int docID) throws IOException {
     int termID = bytesHash.addByPoolOffset(textStart);
     if (termID >= 0) {      // New posting
       // First time we are seeing this token since we last
       // flushed the hash.
-      // Init stream slices
-      if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
-        intPool.nextBuffer();
-      }
-
-      if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < 
numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
-        bytePool.nextBuffer();
-      }
+      initStreamSlices(termID, docID);
+    } else {
+      positionStreamSlice(termID, docID);
+    }
+  }
 
-      intUptos = intPool.buffer;
-      intUptoStart = intPool.intUpto;
-      intPool.intUpto += streamCount;
+  private void initStreamSlices(int termID, int docID) throws IOException {
+    // Init stream slices
+    // TODO: figure out why this is 2*streamCount here. streamCount should be 
enough?
+    if ((2*streamCount) + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
+      // can we fit all the streams in the current buffer?
+      intPool.nextBuffer();
+    }
 
-      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
+    if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < (2*streamCount) * 
ByteBlockPool.FIRST_LEVEL_SIZE) {
+      // can we fit at least one byte per stream in the current buffer, if not 
allocated a new one

Review comment:
       s/`allocated`/`allocate`

##########
File path: lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
##########
@@ -19,203 +19,207 @@
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash.BytesStartArray;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.IntBlockPool;
 
+/**
+ * This class allows to store streams of information per term without knowing
+ * the size of the stream ahead of time. Each stream typically encodes one 
level
+ * of information like term frequency per document or term proximity. 
Internally
+ * this class allocates a linked list of slices that can be read by a {@link 
ByteSliceReader}
+ * for each term. Terms are first deduplicated in a {@link BytesRefHash} once 
this is done
+ * internal data-structures point to the current offset of each stream that 
can be written to.
+ */
 abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
   private static final int HASH_INIT_SIZE = 4;
 
-  final TermsHash termsHash;
-
-  final TermsHashPerField nextPerField;
-  protected final DocumentsWriterPerThread.DocState docState;
-  protected final FieldInvertState fieldState;
-  TermToBytesRefAttribute termAtt;
-  protected TermFrequencyAttribute termFreqAtt;
-
-  // Copied from our perThread
-  final IntBlockPool intPool;
+  private final TermsHashPerField nextPerField;
+  private final IntBlockPool intPool;
   final ByteBlockPool bytePool;
-  final ByteBlockPool termBytePool;
-
-  final int streamCount;
-  final int numPostingInt;
-
-  protected final FieldInfo fieldInfo;
-
-  final BytesRefHash bytesHash;
+  // for each term we store an integer per stream that points into the 
bytePool above
+  // the address is updated once data is written to the stream to point to the 
next free offset
+  // this the terms stream. The start address for the stream is stored in 
postingsArray.byteStarts[termId]
+  // This is initialized in the #addTerm method, either to a brand new per 
term stream if the term is new or
+  // to the addresses where the term stream was written to when we saw it the 
last time.
+  private int[] termStreamAddressBuffer;
+  private int streamAddressOffset;
+  private final int streamCount;
+  private final String fieldName;
+  final IndexOptions indexOptions;
+  /* This stores the actual term bytes for postings and offsets into the 
parent hash in the case that this
+  * TermsHashPerField is hashing term vectors.*/
+  private final BytesRefHash bytesHash;
 
   ParallelPostingsArray postingsArray;
-  private final Counter bytesUsed;
+  private int lastDocID; // only with assert
 
   /** streamCount: how many streams this field stores per term.
    * E.g. doc(+freq) is 1 stream, prox+offset is a second. */
-
-  public TermsHashPerField(int streamCount, FieldInvertState fieldState, 
TermsHash termsHash, TermsHashPerField nextPerField, FieldInfo fieldInfo) {
-    intPool = termsHash.intPool;
-    bytePool = termsHash.bytePool;
-    termBytePool = termsHash.termBytePool;
-    docState = termsHash.docState;
-    this.termsHash = termsHash;
-    bytesUsed = termsHash.bytesUsed;
-    this.fieldState = fieldState;
+  TermsHashPerField(int streamCount, IntBlockPool intPool, ByteBlockPool 
bytePool, ByteBlockPool termBytePool,
+                    Counter bytesUsed, TermsHashPerField nextPerField, String 
fieldName, IndexOptions indexOptions) {
+    this.intPool = intPool;
+    this.bytePool = bytePool;
     this.streamCount = streamCount;
-    numPostingInt = 2*streamCount;
-    this.fieldInfo = fieldInfo;
+    this.fieldName = fieldName;
     this.nextPerField = nextPerField;
+    assert indexOptions != IndexOptions.NONE;

Review comment:
       Ahh I see, we just moved the assertion to a better place, awesome.

##########
File path: 
lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
##########
@@ -56,12 +56,6 @@ public FreqProxTermsWriterPerField(FieldInvertState 
invertState, TermsHash terms
   @Override
   void finish() throws IOException {
     super.finish();
-    sumDocFreq += fieldState.uniqueTermCount;
-    sumTotalTermFreq += fieldState.length;

Review comment:
       Hmm, did these aggregations move somewhere else?  Oh, they look entirely 
removed?  Were they redundant (computed elsewhere) and these ones were unused?

##########
File path: 
lucene/core/src/java/org/apache/lucene/index/ParallelPostingsArray.java
##########
@@ -22,14 +22,14 @@
   final static int BYTES_PER_POSTING = 3 * Integer.BYTES;
 
   final int size;
-  final int[] textStarts;
-  final int[] intStarts;
-  final int[] byteStarts;
+  final int[] textStarts; // maps term ID to the terms text start in the 
bytesHash

Review comment:
       s/`terms text`/`terms's text`

##########
File path: 
lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
##########
@@ -222,7 +234,7 @@ void writeProx(TermVectorsPostingsArray postings, int 
termID) {
   }
 
   @Override
-  void newTerm(final int termID) {
+  void newTerm(final int termID, final int docID) {

Review comment:
       Hmm `docID` is unused in this method?  But I guess the other impl 
(normal postings) needs it?

##########
File path: lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
##########
@@ -19,203 +19,207 @@
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash.BytesStartArray;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.IntBlockPool;
 
+/**
+ * This class allows to store streams of information per term without knowing

Review comment:
       Thank you for the javadocs/comments for such cryptic and ancient code ;)
   
   Maybe just `stores` instead of `allows to store`?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [lucene-solr] mikemccand commented on a change in pull request #1573: Cleanup TermsHashPerField

Reply via email to