mikemccand commented on a change in pull request #1573:
URL: https://github.com/apache/lucene-solr/pull/1573#discussion_r439751347
##########
File path:
lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
##########
@@ -207,8 +202,6 @@ public void newPostingsArray() {
@Override
ParallelPostingsArray createPostingsArray(int size) {
- IndexOptions indexOptions = fieldInfo.getIndexOptions();
- assert indexOptions != IndexOptions.NONE;
Review comment:
Hmm why not keep this assertion (to confirm that if the field is not
somehow indexed we are not accidentally/incorrectly running this code)?
##########
File path:
lucene/core/src/java/org/apache/lucene/index/ParallelPostingsArray.java
##########
@@ -22,14 +22,14 @@
final static int BYTES_PER_POSTING = 3 * Integer.BYTES;
final int size;
- final int[] textStarts;
- final int[] intStarts;
- final int[] byteStarts;
+ final int[] textStarts; // maps term ID to the terms text start in the
bytesHash
+ final int[] addressOffset; // maps term ID to current stream address
Review comment:
+1 for this renaming!
##########
File path: lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
##########
@@ -19,203 +19,207 @@
import java.io.IOException;
-import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IntBlockPool;
+/**
+ * This class allows to store streams of information per term without knowing
+ * the size of the stream ahead of time. Each stream typically encodes one
level
+ * of information like term frequency per document or term proximity.
Internally
+ * this class allocates a linked list of slices that can be read by a {@link
ByteSliceReader}
+ * for each term. Terms are first deduplicated in a {@link BytesRefHash} once
this is done
+ * internal data-structures point to the current offset of each stream that
can be written to.
+ */
abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
private static final int HASH_INIT_SIZE = 4;
- final TermsHash termsHash;
-
- final TermsHashPerField nextPerField;
- protected final DocumentsWriterPerThread.DocState docState;
- protected final FieldInvertState fieldState;
- TermToBytesRefAttribute termAtt;
- protected TermFrequencyAttribute termFreqAtt;
-
- // Copied from our perThread
- final IntBlockPool intPool;
+ private final TermsHashPerField nextPerField;
+ private final IntBlockPool intPool;
final ByteBlockPool bytePool;
- final ByteBlockPool termBytePool;
-
- final int streamCount;
- final int numPostingInt;
-
- protected final FieldInfo fieldInfo;
-
- final BytesRefHash bytesHash;
+ // for each term we store an integer per stream that points into the
bytePool above
+ // the address is updated once data is written to the stream to point to the
next free offset
+ // this the terms stream. The start address for the stream is stored in
postingsArray.byteStarts[termId]
+ // This is initialized in the #addTerm method, either to a brand new per
term stream if the term is new or
+ // to the addresses where the term stream was written to when we saw it the
last time.
+ private int[] termStreamAddressBuffer;
+ private int streamAddressOffset;
+ private final int streamCount;
+ private final String fieldName;
+ final IndexOptions indexOptions;
+ /* This stores the actual term bytes for postings and offsets into the
parent hash in the case that this
+ * TermsHashPerField is hashing term vectors.*/
+ private final BytesRefHash bytesHash;
ParallelPostingsArray postingsArray;
- private final Counter bytesUsed;
+ private int lastDocID; // only with assert
/** streamCount: how many streams this field stores per term.
* E.g. doc(+freq) is 1 stream, prox+offset is a second. */
-
- public TermsHashPerField(int streamCount, FieldInvertState fieldState,
TermsHash termsHash, TermsHashPerField nextPerField, FieldInfo fieldInfo) {
- intPool = termsHash.intPool;
- bytePool = termsHash.bytePool;
- termBytePool = termsHash.termBytePool;
- docState = termsHash.docState;
- this.termsHash = termsHash;
- bytesUsed = termsHash.bytesUsed;
- this.fieldState = fieldState;
+ TermsHashPerField(int streamCount, IntBlockPool intPool, ByteBlockPool
bytePool, ByteBlockPool termBytePool,
+ Counter bytesUsed, TermsHashPerField nextPerField, String
fieldName, IndexOptions indexOptions) {
+ this.intPool = intPool;
+ this.bytePool = bytePool;
this.streamCount = streamCount;
- numPostingInt = 2*streamCount;
- this.fieldInfo = fieldInfo;
+ this.fieldName = fieldName;
this.nextPerField = nextPerField;
+ assert indexOptions != IndexOptions.NONE;
+ this.indexOptions = indexOptions;
PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this,
bytesUsed);
bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE, byteStarts);
}
void reset() {
bytesHash.clear(false);
+ sortedTermIDs = null;
if (nextPerField != null) {
nextPerField.reset();
}
}
- public void initReader(ByteSliceReader reader, int termID, int stream) {
+ final void initReader(ByteSliceReader reader, int termID, int stream) {
assert stream < streamCount;
- int intStart = postingsArray.intStarts[termID];
- final int[] ints = intPool.buffers[intStart >>
IntBlockPool.INT_BLOCK_SHIFT];
- final int upto = intStart & IntBlockPool.INT_BLOCK_MASK;
+ int streamStartOffset = postingsArray.addressOffset[termID];
+ final int[] streamAddressBuffer = intPool.buffers[streamStartOffset >>
IntBlockPool.INT_BLOCK_SHIFT];
+ final int offsetInAddressBuffer = streamStartOffset &
IntBlockPool.INT_BLOCK_MASK;
reader.init(bytePool,
postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
- ints[upto+stream]);
+ streamAddressBuffer[offsetInAddressBuffer+stream]);
}
- int[] sortedTermIDs;
+ private int[] sortedTermIDs;
/** Collapse the hash table and sort in-place; also sets
- * this.sortedTermIDs to the results */
- public int[] sortPostings() {
+ * this.sortedTermIDs to the results
+ * This method should not be called twice unless {@link #reset()}
+ * or {@link #reinitHash()} was called. */
+ final void sortTerms() {
+ assert sortedTermIDs == null;
sortedTermIDs = bytesHash.sort();
+ }
+
+ /**
+ * Returns the sorted term IDs. {@link #sortTerms()} must be called before
+ */
+ final int[] getSortedTermIDs() {
+ assert sortedTermIDs != null;
return sortedTermIDs;
}
+ final void reinitHash() {
+ sortedTermIDs = null;
+ bytesHash.reinit();
+ }
+
private boolean doNextCall;
// Secondary entry point (for 2nd & subsequent TermsHash),
// because token text has already been "interned" into
// textStart, so we hash by textStart. term vectors use
// this API.
- public void add(int textStart) throws IOException {
+ private void add(int textStart, final int docID) throws IOException {
int termID = bytesHash.addByPoolOffset(textStart);
if (termID >= 0) { // New posting
// First time we are seeing this token since we last
// flushed the hash.
- // Init stream slices
- if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
- intPool.nextBuffer();
- }
-
- if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto <
numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
- bytePool.nextBuffer();
- }
+ initStreamSlices(termID, docID);
+ } else {
+ positionStreamSlice(termID, docID);
+ }
+ }
- intUptos = intPool.buffer;
- intUptoStart = intPool.intUpto;
- intPool.intUpto += streamCount;
+ private void initStreamSlices(int termID, int docID) throws IOException {
+ // Init stream slices
+ // TODO: figure out why this is 2*streamCount here. streamCount should be
enough?
+ if ((2*streamCount) + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
+ // can we fit all the streams in the current buffer?
+ intPool.nextBuffer();
+ }
- postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
+ if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < (2*streamCount) *
ByteBlockPool.FIRST_LEVEL_SIZE) {
+ // can we fit at least one byte per stream in the current buffer, if not
allocated a new one
Review comment:
s/`allocated`/`allocate`
##########
File path: lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
##########
@@ -19,203 +19,207 @@
import java.io.IOException;
-import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IntBlockPool;
+/**
+ * This class allows to store streams of information per term without knowing
+ * the size of the stream ahead of time. Each stream typically encodes one
level
+ * of information like term frequency per document or term proximity.
Internally
+ * this class allocates a linked list of slices that can be read by a {@link
ByteSliceReader}
+ * for each term. Terms are first deduplicated in a {@link BytesRefHash} once
this is done
+ * internal data-structures point to the current offset of each stream that
can be written to.
+ */
abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
private static final int HASH_INIT_SIZE = 4;
- final TermsHash termsHash;
-
- final TermsHashPerField nextPerField;
- protected final DocumentsWriterPerThread.DocState docState;
- protected final FieldInvertState fieldState;
- TermToBytesRefAttribute termAtt;
- protected TermFrequencyAttribute termFreqAtt;
-
- // Copied from our perThread
- final IntBlockPool intPool;
+ private final TermsHashPerField nextPerField;
+ private final IntBlockPool intPool;
final ByteBlockPool bytePool;
- final ByteBlockPool termBytePool;
-
- final int streamCount;
- final int numPostingInt;
-
- protected final FieldInfo fieldInfo;
-
- final BytesRefHash bytesHash;
+ // for each term we store an integer per stream that points into the
bytePool above
+ // the address is updated once data is written to the stream to point to the
next free offset
+ // this the terms stream. The start address for the stream is stored in
postingsArray.byteStarts[termId]
+ // This is initialized in the #addTerm method, either to a brand new per
term stream if the term is new or
+ // to the addresses where the term stream was written to when we saw it the
last time.
+ private int[] termStreamAddressBuffer;
+ private int streamAddressOffset;
+ private final int streamCount;
+ private final String fieldName;
+ final IndexOptions indexOptions;
+ /* This stores the actual term bytes for postings and offsets into the
parent hash in the case that this
+ * TermsHashPerField is hashing term vectors.*/
+ private final BytesRefHash bytesHash;
ParallelPostingsArray postingsArray;
- private final Counter bytesUsed;
+ private int lastDocID; // only with assert
/** streamCount: how many streams this field stores per term.
* E.g. doc(+freq) is 1 stream, prox+offset is a second. */
-
- public TermsHashPerField(int streamCount, FieldInvertState fieldState,
TermsHash termsHash, TermsHashPerField nextPerField, FieldInfo fieldInfo) {
- intPool = termsHash.intPool;
- bytePool = termsHash.bytePool;
- termBytePool = termsHash.termBytePool;
- docState = termsHash.docState;
- this.termsHash = termsHash;
- bytesUsed = termsHash.bytesUsed;
- this.fieldState = fieldState;
+ TermsHashPerField(int streamCount, IntBlockPool intPool, ByteBlockPool
bytePool, ByteBlockPool termBytePool,
+ Counter bytesUsed, TermsHashPerField nextPerField, String
fieldName, IndexOptions indexOptions) {
+ this.intPool = intPool;
+ this.bytePool = bytePool;
this.streamCount = streamCount;
- numPostingInt = 2*streamCount;
- this.fieldInfo = fieldInfo;
+ this.fieldName = fieldName;
this.nextPerField = nextPerField;
+ assert indexOptions != IndexOptions.NONE;
Review comment:
Ahh I see, we just moved the assertion to a better place, awesome.
##########
File path:
lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
##########
@@ -56,12 +56,6 @@ public FreqProxTermsWriterPerField(FieldInvertState
invertState, TermsHash terms
@Override
void finish() throws IOException {
super.finish();
- sumDocFreq += fieldState.uniqueTermCount;
- sumTotalTermFreq += fieldState.length;
Review comment:
Hmm, did these aggregations move somewhere else? Oh, they look entirely
removed? Were they redundant (computed elsewhere) and these ones were unused?
##########
File path:
lucene/core/src/java/org/apache/lucene/index/ParallelPostingsArray.java
##########
@@ -22,14 +22,14 @@
final static int BYTES_PER_POSTING = 3 * Integer.BYTES;
final int size;
- final int[] textStarts;
- final int[] intStarts;
- final int[] byteStarts;
+ final int[] textStarts; // maps term ID to the terms text start in the
bytesHash
Review comment:
s/`terms text`/`terms's text`
##########
File path:
lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
##########
@@ -222,7 +234,7 @@ void writeProx(TermVectorsPostingsArray postings, int
termID) {
}
@Override
- void newTerm(final int termID) {
+ void newTerm(final int termID, final int docID) {
Review comment:
Hmm `docID` is unused in this method? But I guess the other impl
(normal postings) needs it?
##########
File path: lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
##########
@@ -19,203 +19,207 @@
import java.io.IOException;
-import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IntBlockPool;
+/**
+ * This class allows to store streams of information per term without knowing
Review comment:
Thank you for the javadocs/comments for such cryptic and ancient code ;)
Maybe just `stores` instead of `allows to store`?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]