dweiss commented on a change in pull request #1573:
URL: https://github.com/apache/lucene-solr/pull/1573#discussion_r439858046
##########
File path: lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
##########
@@ -19,203 +19,207 @@
 import java.io.IOException;
 
-import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash.BytesStartArray;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.IntBlockPool;
 
+/**
+ * This class stores streams of information per term without knowing
+ * the size of the stream ahead of time. Each stream typically encodes one level
+ * of information like term frequency per document or term proximity. Internally
+ * this class allocates a linked list of slices that can be read by a {@link ByteSliceReader}
+ * for each term. Terms are first deduplicated in a {@link BytesRefHash} once this is done
+ * internal data-structures point to the current offset of each stream that can be written to.
+ */
 abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
   private static final int HASH_INIT_SIZE = 4;
-  final TermsHash termsHash;
-
-  final TermsHashPerField nextPerField;
-  protected final DocumentsWriterPerThread.DocState docState;
-  protected final FieldInvertState fieldState;
-  TermToBytesRefAttribute termAtt;
-  protected TermFrequencyAttribute termFreqAtt;
-
-  // Copied from our perThread
-  final IntBlockPool intPool;
+  private final TermsHashPerField nextPerField;
+  private final IntBlockPool intPool;
   final ByteBlockPool bytePool;
-  final ByteBlockPool termBytePool;
-
-  final int streamCount;
-  final int numPostingInt;
-
-  protected final FieldInfo fieldInfo;
-
-  final BytesRefHash bytesHash;
+  // for each term we store an integer per stream that points into the bytePool above
+  // the address is updated once data is written to the stream to point to the next free offset
+  // this the terms stream. The start address for the stream is stored in postingsArray.byteStarts[termId]
+  // This is initialized in the #addTerm method, either to a brand new per term stream if the term is new or
+  // to the addresses where the term stream was written to when we saw it the last time.
+  private int[] termStreamAddressBuffer;
+  private int streamAddressOffset;
+  private final int streamCount;
+  private final String fieldName;
+  final IndexOptions indexOptions;
+  /* This stores the actual term bytes for postings and offsets into the parent hash in the case that this
+   * TermsHashPerField is hashing term vectors.*/
+  private final BytesRefHash bytesHash;
   ParallelPostingsArray postingsArray;
-  private final Counter bytesUsed;
+  private int lastDocID; // only with assert
 
   /** streamCount: how many streams this field stores per term.
    * E.g. doc(+freq) is 1 stream, prox+offset is a second.
   */
-
-  public TermsHashPerField(int streamCount, FieldInvertState fieldState, TermsHash termsHash, TermsHashPerField nextPerField, FieldInfo fieldInfo) {
-    intPool = termsHash.intPool;
-    bytePool = termsHash.bytePool;
-    termBytePool = termsHash.termBytePool;
-    docState = termsHash.docState;
-    this.termsHash = termsHash;
-    bytesUsed = termsHash.bytesUsed;
-    this.fieldState = fieldState;
+  TermsHashPerField(int streamCount, IntBlockPool intPool, ByteBlockPool bytePool, ByteBlockPool termBytePool,
+                    Counter bytesUsed, TermsHashPerField nextPerField, String fieldName, IndexOptions indexOptions) {
+    this.intPool = intPool;
+    this.bytePool = bytePool;
     this.streamCount = streamCount;
-    numPostingInt = 2*streamCount;
-    this.fieldInfo = fieldInfo;
+    this.fieldName = fieldName;
     this.nextPerField = nextPerField;
+    assert indexOptions != IndexOptions.NONE;
+    this.indexOptions = indexOptions;
     PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, bytesUsed);
     bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE, byteStarts);
   }
 
   void reset() {
     bytesHash.clear(false);
+    sortedTermIDs = null;
     if (nextPerField != null) {
       nextPerField.reset();
     }
   }
 
-  public void initReader(ByteSliceReader reader, int termID, int stream) {
+  final void initReader(ByteSliceReader reader, int termID, int stream) {
     assert stream < streamCount;
-    int intStart = postingsArray.intStarts[termID];
-    final int[] ints = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
-    final int upto = intStart & IntBlockPool.INT_BLOCK_MASK;
+    int streamStartOffset = postingsArray.addressOffset[termID];
+    final int[] streamAddressBuffer = intPool.buffers[streamStartOffset >> IntBlockPool.INT_BLOCK_SHIFT];
+    final int offsetInAddressBuffer = streamStartOffset & IntBlockPool.INT_BLOCK_MASK;
     reader.init(bytePool,
                 postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
-                ints[upto+stream]);
+                streamAddressBuffer[offsetInAddressBuffer+stream]);
   }
 
-  int[] sortedTermIDs;
+  private int[] sortedTermIDs;
 
   /** Collapse the hash table and sort in-place; also sets
-   * this.sortedTermIDs to the results */
-  public int[] sortPostings() {
+   * this.sortedTermIDs to the results
+   * This method should not be called twice unless {@link #reset()}

Review comment:
       should not -> must not perhaps?
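For context on the hunk above: the renamed fields implement a two-level indirection. Each term owns streamCount consecutive int slots, found via postingsArray.addressOffset[termID], and each slot holds the current write offset of one stream inside the byte pool. Below is a minimal sketch of that idea, assuming flat arrays in place of Lucene's IntBlockPool/ByteBlockPool and with no slice growth; all names and sizes are illustrative, not the actual implementation.

    // Minimal sketch of the per-term stream addressing described above.
    // Flat arrays stand in for Lucene's block pools and there is no slice
    // growth, so all sizes here are purely illustrative.
    class StreamAddressSketch {
      static final int STREAM_COUNT = 2;               // e.g. doc+freq stream, prox stream
      static final int SLICE_SIZE = 16;                // pretend per-stream slice size
      final int[] streamAddressBuffer = new int[1024]; // one slot per (term, stream)
      final byte[] bytePool = new byte[64 * 1024];     // where the stream bytes live
      int nextAddressSlot = 0;
      int nextByteOffset = 0;

      // First time a term is seen: reserve one address slot per stream and
      // point each slot at a fresh region of the byte pool.
      int newTerm() {
        int addressOffset = nextAddressSlot;
        for (int stream = 0; stream < STREAM_COUNT; stream++) {
          streamAddressBuffer[addressOffset + stream] = nextByteOffset;
          nextByteOffset += SLICE_SIZE;
        }
        nextAddressSlot += STREAM_COUNT;
        return addressOffset; // the value addressOffset[termID] would hold
      }

      // Append a byte to one of the term's streams; the slot is bumped so it
      // always points at the next free byte of that stream.
      void writeByte(int addressOffset, int stream, byte b) {
        int offset = streamAddressBuffer[addressOffset + stream];
        bytePool[offset] = b;
        streamAddressBuffer[addressOffset + stream] = offset + 1;
      }
    }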
##########
File path: lucene/core/src/test/org/apache/lucene/index/TestTermsHashPerField.java
##########
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
+import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.IntBlockPool;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestTermsHashPerField extends LuceneTestCase {
+
+  private static TermsHashPerField createNewHash(AtomicInteger newCalled, AtomicInteger addCalled) {
+    IntBlockPool intBlockPool = new IntBlockPool();
+    ByteBlockPool byteBlockPool = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
+    ByteBlockPool termBlockPool = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
+
+    TermsHashPerField hash = new TermsHashPerField(1, intBlockPool, byteBlockPool, termBlockPool, Counter.newCounter(),
+        null, "testfield", IndexOptions.DOCS_AND_FREQS) {
+
+      private FreqProxTermsWriterPerField.FreqProxPostingsArray freqProxPostingsArray;
+
+      @Override
+      void newTerm(int termID, int docID) {
+        newCalled.incrementAndGet();
+        FreqProxTermsWriterPerField.FreqProxPostingsArray postings = freqProxPostingsArray;
+        postings.lastDocIDs[termID] = docID;
+        postings.lastDocCodes[termID] = docID << 1;
+        postings.termFreqs[termID] = 1;
+      }
+
+      @Override
+      void addTerm(int termID, int docID) {
+        addCalled.incrementAndGet();
+        FreqProxTermsWriterPerField.FreqProxPostingsArray postings = freqProxPostingsArray;
+        if (docID != postings.lastDocIDs[termID]) {
+          if (1 == postings.termFreqs[termID]) {
+            writeVInt(0, postings.lastDocCodes[termID]|1);
+          } else {
+            writeVInt(0, postings.lastDocCodes[termID]);
+            writeVInt(0, postings.termFreqs[termID]);
+          }
+          postings.termFreqs[termID] = 1;
+          postings.lastDocCodes[termID] = (docID - postings.lastDocIDs[termID]) << 1;
+          postings.lastDocIDs[termID] = docID;
+        } else {
+          postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], 1);
+        }
+      }
+
+      @Override
+      void newPostingsArray() {
+        freqProxPostingsArray = (FreqProxTermsWriterPerField.FreqProxPostingsArray) postingsArray;
+ 

Review comment:
       extra space?
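A note for readers following the addTerm override above: lastDocCodes holds the document delta shifted left by one, with the low bit flagging the common freq == 1 case so the frequency can be omitted, and the pending code is only flushed once the term is seen again in a later document. Here is a standalone sketch of that convention, assuming a plain varint encoder into a list in place of the class's slice-backed writeVInt; the class and method names are illustrative only.

    import java.util.ArrayList;
    import java.util.List;

    // Sketch of the doc/freq encoding exercised by the overridden addTerm.
    class DocCodeSketch {
      final List<Byte> out = new ArrayList<>();

      // Standard unsigned varint: 7 payload bits per byte, high bit = continuation.
      void writeVInt(int value) {
        while ((value & ~0x7F) != 0) {
          out.add((byte) ((value & 0x7F) | 0x80));
          value >>>= 7;
        }
        out.add((byte) value);
      }

      // Flush one posting: doc delta shifted left by one, low bit set iff
      // freq == 1, in which case the frequency itself is omitted.
      void writePosting(int docDelta, int freq) {
        if (freq == 1) {
          writeVInt((docDelta << 1) | 1);
        } else {
          writeVInt(docDelta << 1);
          writeVInt(freq);
        }
      }
    }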