somandal commented on code in PR #9810:
URL: https://github.com/apache/pinot/pull/9810#discussion_r1039974276


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/InvertedIndexAndDictionaryBasedForwardIndexCreator.java:
##########
@@ -0,0 +1,619 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.loader;
+
+import com.google.common.base.Preconditions;
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.commons.io.FileUtils;
+import 
org.apache.pinot.segment.local.segment.index.readers.BitmapInvertedIndexReader;
+import org.apache.pinot.segment.spi.ColumnMetadata;
+import org.apache.pinot.segment.spi.SegmentMetadata;
+import org.apache.pinot.segment.spi.V1Constants;
+import org.apache.pinot.segment.spi.compression.ChunkCompressionType;
+import org.apache.pinot.segment.spi.creator.IndexCreationContext;
+import org.apache.pinot.segment.spi.creator.IndexCreatorProvider;
+import org.apache.pinot.segment.spi.index.creator.ForwardIndexCreator;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
+import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
+import org.apache.pinot.segment.spi.store.ColumnIndexType;
+import org.apache.pinot.segment.spi.store.SegmentDirectory;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.utils.BigDecimalUtils;
+import org.apache.pinot.spi.utils.ByteArray;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static 
org.apache.pinot.segment.spi.V1Constants.MetadataKeys.Column.DICTIONARY_ELEMENT_SIZE;
+import static 
org.apache.pinot.segment.spi.V1Constants.MetadataKeys.Column.HAS_DICTIONARY;
+import static 
org.apache.pinot.segment.spi.V1Constants.MetadataKeys.Column.MAX_MULTI_VALUE_ELEMENTS;
+import static 
org.apache.pinot.segment.spi.V1Constants.MetadataKeys.Column.TOTAL_NUMBER_OF_ENTRIES;
+import static 
org.apache.pinot.segment.spi.V1Constants.MetadataKeys.Column.getKeyFor;
+
+
+/**
+ * Helper class used by the {@link SegmentPreProcessor} to generate the 
forward index from inverted index and
+ * dictionary when the forward index is enabled for columns where it was 
previously disabled. This is also invoked by
+ * the {@link IndexHandler} code in scenarios where the forward index needs to 
be temporarily created to generate other
+ * indexes for the given column. In such cases the forward index will be 
cleaned up after the {@link IndexHandler} code
+ * completes.
+ *
+ * For multi-value columns the following invariants cannot be maintained:
+ * - Ordering of elements within a given multi-value row. This will always be 
a limitation.
+ * - Data loss is possible if there are repeats of elements within a given 
multi-value row. This limitation will be
+ *   addressed as a future change
+ *
+ * TODO: Currently for multi-value columns generating the forward index can 
lead to data loss as frequency information
+ *       is not available for repeats within a given row. This needs to be 
addressed by tracking the frequency data
+ *       as part of an on-disk structure when forward index is disabled for a 
column.
+ */
+public class InvertedIndexAndDictionaryBasedForwardIndexCreator implements 
AutoCloseable {
+  private static final Logger LOGGER =
+      
LoggerFactory.getLogger(InvertedIndexAndDictionaryBasedForwardIndexCreator.class);
+
+  // Use MMapBuffer if the value buffer size is larger than 2G
+  private static final int NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER = 500_000_000;
+
+  private static final String FORWARD_INDEX_VALUE_BUFFER_SUFFIX = 
".fwd.idx.val.buf";
+  private static final String FORWARD_INDEX_LENGTH_BUFFER_SUFFIX = 
".fwd.idx.len.buf";
+  private static final String FORWARD_INDEX_MAX_SIZE_BUFFER_SUFFIX = 
".fwd.idx.maxsize.buf";
+
+  private final String _columnName;
+  private final SegmentMetadata _segmentMetadata;
+  private final IndexLoadingConfig _indexLoadingConfig;
+  private final SegmentDirectory.Writer _segmentWriter;
+  private final IndexCreatorProvider _indexCreatorProvider;
+  private final boolean _isTemporaryForwardIndex;
+
+  // Metadata
+  private final ColumnMetadata _columnMetadata;
+  private final boolean _singleValue;
+  private final int _cardinality;
+  private final int _numDocs;
+  private final int _maxNumberOfMultiValues;
+  private final int _totalNumberOfEntries;
+  private final boolean _dictionaryEnabled;
+  private final ChunkCompressionType _chunkCompressionType;
+  private final boolean _useMMapBuffer;
+
+  // Files and temporary buffers
+  private final File _forwardIndexFile;
+  private final File _forwardIndexValueBufferFile;
+  private final File _forwardIndexLengthBufferFile;
+  private final File _forwardIndexMaxSizeBufferFile;
+
+  // Forward index buffers (to store the dictId at the correct docId)
+  private PinotDataBuffer _forwardIndexValueBuffer;
+  // For multi-valued column only because each docId can have multiple dictIds
+  private PinotDataBuffer _forwardIndexLengthBuffer;
+  private int _nextValueId;
+  // For multi-valued column only to track max row size
+  private PinotDataBuffer _forwardIndexMaxSizeBuffer;
+
+  public InvertedIndexAndDictionaryBasedForwardIndexCreator(String columnName, 
SegmentMetadata segmentMetadata,
+      IndexLoadingConfig indexLoadingConfig, SegmentDirectory.Writer 
segmentWriter,

Review Comment:
   Double-checked; can't find anything wrong.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to