klsince commented on code in PR #11993: URL: https://github.com/apache/pinot/pull/11993#discussion_r1393458717
########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedBitDedupMVForwardIndexWriter.java: ########## @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.writer.impl; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.nio.ByteOrder; +import org.apache.pinot.segment.local.io.util.FixedBitIntReaderWriter; +import org.apache.pinot.segment.local.io.util.PinotDataBitSet; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; + + +/** + * Bit-compressed dictionary-encoded forward index writer for multi-value columns, where a map from doc id to + * multi-value id and unique multi-values are stored. + * + * Index layout: + * - Index header (24 bytes) + * - ID buffer (stores the multi-value id for each doc id) + * - Offset buffer (stores the start offset of each multi-value, followed by end offset of the last value) + * - Value buffer (stores the actual values) + * + * Header layout: + * - Magic marker (4 bytes) + * - Version (2 bytes) + * - Bits per value (1 byte) + * - Bits per id (1 byte) + * - Number of unique values (4 bytes) + * - Number of total entries (4 bytes) + * - Start offset of offset buffer (4 bytes) + * - Start offset of value buffer (4 bytes) + */ +public class FixedBitDedupMVForwardIndexWriter implements Closeable { + public static final int MAGIC_MARKER = 0xdedffded; + public static final short VERSION = 1; + + private final Object2IntOpenHashMap<IntArrayList> _valueToIdMap = new Object2IntOpenHashMap<>(); + private final File _file; + private final int _numBitsPerValue; + private final IntArrayList _ids; + + public FixedBitDedupMVForwardIndexWriter(File file, int numDocs, int numBitsPerValue) { + _file = file; + _numBitsPerValue = numBitsPerValue; + _ids = new IntArrayList(numDocs); + } + + public void putDictIds(int[] dictIds) { + _ids.add(_valueToIdMap.computeIntIfAbsent(IntArrayList.wrap(dictIds), k -> _valueToIdMap.size())); Review Comment: perhaps add some comments for this method, as it's the key to translating an entire MV entry to a unique id and doing the dedup part (👍 for this neat implementation) ########## pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/Dictionary.java: ########## @@ -208,24 +208,48 @@ default void readIntValues(int[] dictIds, int length, int[] outValues) { } } + default void readIntValues(int[] dictIds, int length, Integer[] outValues) { Review Comment: are those new methods to allow null values? ########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedBitDedupMVForwardIndexWriter.java: ########## @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.writer.impl; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.nio.ByteOrder; +import org.apache.pinot.segment.local.io.util.FixedBitIntReaderWriter; +import org.apache.pinot.segment.local.io.util.PinotDataBitSet; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; + + +/** + * Bit-compressed dictionary-encoded forward index writer for multi-value columns, where a map from doc id to + * multi-value id and unique multi-values are stored. + * + * Index layout: + * - Index header (24 bytes) + * - ID buffer (stores the multi-value id for each doc id) + * - Offset buffer (stores the start offset of each multi-value, followed by end offset of the last value) + * - Value buffer (stores the actual values) + * + * Header layout: + * - Magic marker (4 bytes) + * - Version (2 bytes) + * - Bits per value (1 byte) + * - Bits per id (1 byte) + * - Number of unique values (4 bytes) + * - Number of total entries (4 bytes) + * - Start offset of offset buffer (4 bytes) + * - Start offset of value buffer (4 bytes) + */ +public class FixedBitDedupMVForwardIndexWriter implements Closeable { + public static final int MAGIC_MARKER = 0xdedffded; + public static final short VERSION = 1; + + private final Object2IntOpenHashMap<IntArrayList> _valueToIdMap = new Object2IntOpenHashMap<>(); + private final File _file; + private final int _numBitsPerValue; + private final IntArrayList _ids; + + public FixedBitDedupMVForwardIndexWriter(File file, int numDocs, int numBitsPerValue) { + _file = file; + _numBitsPerValue = numBitsPerValue; + _ids = new IntArrayList(numDocs); + } + + public void putDictIds(int[] dictIds) { + _ids.add(_valueToIdMap.computeIntIfAbsent(IntArrayList.wrap(dictIds), k -> _valueToIdMap.size())); Review Comment: perhaps add some comments for this method, as it's the key to translating an entire MV entry to a unique id and doing the dedup part (👍 for this neat implementation) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org