This is an automated email from the ASF dual-hosted git repository. jackie pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new 2d43123026 [Clean up] Remove the support for non-zero padding (#10087) 2d43123026 is described below commit 2d43123026388600ba7d3e57c159d3139da919c6 Author: Xiaotian (Jackie) Jiang <17555551+jackie-ji...@users.noreply.github.com> AuthorDate: Wed Jan 18 15:59:05 2023 -0800 [Clean up] Remove the support for non-zero padding (#10087) --- .../perf/BenchmarkFixedByteValueReaderWriter.java | 11 ++- .../pinot/perf/BenchmarkOfflineIndexReader.java | 2 +- .../local/io/util/FixedByteValueReaderWriter.java | 40 +++++------ .../pinot/segment/local/io/util/ValueReader.java | 13 ++-- .../local/io/util/VarLengthValueReader.java | 4 +- .../index/column/PhysicalColumnIndexContainer.java | 10 +-- .../loader/bloomfilter/BloomFilterHandler.java | 3 +- .../ColumnMinMaxValueGenerator.java | 32 ++++----- .../index/readers/BaseImmutableDictionary.java | 72 ++++--------------- .../index/readers/BigDecimalDictionary.java | 2 +- .../segment/index/readers/BytesDictionary.java | 2 +- .../segment/index/readers/DoubleDictionary.java | 2 +- .../segment/index/readers/FloatDictionary.java | 2 +- .../local/segment/index/readers/IntDictionary.java | 2 +- .../segment/index/readers/LongDictionary.java | 2 +- .../index/readers/OnHeapBigDecimalDictionary.java | 2 +- .../index/readers/OnHeapBytesDictionary.java | 2 +- .../index/readers/OnHeapDoubleDictionary.java | 2 +- .../index/readers/OnHeapFloatDictionary.java | 2 +- .../segment/index/readers/OnHeapIntDictionary.java | 2 +- .../index/readers/OnHeapLongDictionary.java | 2 +- .../index/readers/OnHeapStringDictionary.java | 30 ++------ .../segment/index/readers/StringDictionary.java | 4 +- .../readers/json/ImmutableJsonIndexReader.java | 2 +- .../io/util/VarLengthValueReaderWriterTest.java | 4 +- .../local/segment/index/loader/LoaderTest.java | 79 --------------------- .../index/readers/ImmutableDictionaryTest.java | 6 +- .../ImmutableDictionaryTypeConversionTest.java | 6 +- .../FixedByteValueReaderWriterTest.java | 5 +- .../readerwriter/ValueReaderComparisonTest.java | 8 +-- .../src/test/resources/data/paddingNull.tar.gz | Bin 1170 -> 0 bytes .../src/test/resources/data/paddingOld.tar.gz | Bin 1157 -> 0 bytes .../src/test/resources/data/paddingPercent.tar.gz | Bin 1174 -> 0 bytes .../apache/pinot/segment/spi/ColumnMetadata.java | 2 - .../org/apache/pinot/segment/spi/V1Constants.java | 1 - .../spi/index/metadata/ColumnMetadataImpl.java | 58 ++++++--------- 36 files changed, 114 insertions(+), 302 deletions(-) diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkFixedByteValueReaderWriter.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkFixedByteValueReaderWriter.java index 7f7d6d0865..c0f95e6c0d 100644 --- a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkFixedByteValueReaderWriter.java +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkFixedByteValueReaderWriter.java @@ -20,6 +20,7 @@ package org.apache.pinot.perf; import java.io.IOException; import java.nio.ByteOrder; +import java.util.Arrays; import java.util.SplittableRandom; import org.apache.pinot.segment.local.io.util.FixedByteValueReaderWriter; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; @@ -45,9 +46,6 @@ public class BenchmarkFixedByteValueReaderWriter { @Param({"true", "false"}) boolean _nativeOrder; - @Param("42") - int _paddingByte; - private PinotDataBuffer _dataBuffer; private FixedByteValueReaderWriter _writer; byte[] _buffer; @@ -59,11 +57,12 @@ public class BenchmarkFixedByteValueReaderWriter { _writer = new FixedByteValueReaderWriter(_dataBuffer); SplittableRandom random = new SplittableRandom(42); byte[] bytes = new byte[_length]; + Arrays.fill(bytes, (byte) 1); for (int i = 0; i < _values; i++) { int index = random.nextInt(bytes.length); - bytes[index] = (byte) _paddingByte; - _writer.writeBytes(i, _length, bytes); bytes[index] = 0; + _writer.writeBytes(i, _length, bytes); + bytes[index] = 1; } _buffer = bytes; } @@ -77,7 +76,7 @@ public class BenchmarkFixedByteValueReaderWriter { @Benchmark public void readStrings(Blackhole bh) { for (int i = 0; i < _values; i++) { - bh.consume(_writer.getUnpaddedString(i, _length, (byte) _paddingByte, _buffer)); + bh.consume(_writer.getUnpaddedString(i, _length, _buffer)); } } } diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkOfflineIndexReader.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkOfflineIndexReader.java index 28842d5d9d..232e1e0bd5 100644 --- a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkOfflineIndexReader.java +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkOfflineIndexReader.java @@ -152,7 +152,7 @@ public class BenchmarkOfflineIndexReader { segmentMetadata.getColumnMetadataFor(DOUBLE_COLUMN_NAME).getCardinality()); ColumnMetadata stringColumnMetadata = segmentMetadata.getColumnMetadataFor(STRING_COLUMN_NAME); _stringDictionary = new StringDictionary(segmentReader.getIndexFor(STRING_COLUMN_NAME, ColumnIndexType.DICTIONARY), - stringColumnMetadata.getCardinality(), stringColumnMetadata.getColumnMaxLength(), (byte) 0); + stringColumnMetadata.getCardinality(), stringColumnMetadata.getColumnMaxLength()); } @Benchmark diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java index 80d7820c80..f4978bd458 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java @@ -56,52 +56,46 @@ public final class FixedByteValueReaderWriter implements ValueReader { /** * Reads the unpadded bytes into the given buffer and returns the length. */ - private int readUnpaddedBytes(int index, int numBytesPerValue, byte paddingByte, byte[] buffer) { + private int readUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer) { // Based on the ZeroInWord algorithm: http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord assert buffer.length >= numBytesPerValue; long startOffset = (long) index * numBytesPerValue; - long pattern = (paddingByte & 0xFFL) * 0x101010101010101L; + boolean littleEndian = _dataBuffer.order() == ByteOrder.LITTLE_ENDIAN; ByteBuffer wrapper = ByteBuffer.wrap(buffer); - if (_dataBuffer.order() == ByteOrder.LITTLE_ENDIAN) { + if (littleEndian) { wrapper.order(ByteOrder.LITTLE_ENDIAN); } - int position = 0; int endIndex = numBytesPerValue & 0xFFFFFFF8; - for (int i = 0; i < endIndex; i += Long.BYTES) { + int i = 0; + for (; i < endIndex; i += Long.BYTES) { long word = _dataBuffer.getLong(startOffset + i); wrapper.putLong(i, word); - long zeroed = word ^ pattern; - long tmp = (zeroed & 0x7F7F7F7F7F7F7F7FL) + 0x7F7F7F7F7F7F7F7FL; - tmp = ~(tmp | zeroed | 0x7F7F7F7F7F7F7F7FL); - if (tmp == 0) { - position += 8; - } else { - position += _dataBuffer.order() == ByteOrder.LITTLE_ENDIAN ? Long.numberOfTrailingZeros(tmp) >>> 3 - : Long.numberOfLeadingZeros(tmp) >>> 3; - return position; + long tmp = ~(((word & 0x7F7F7F7F7F7F7F7FL) + 0x7F7F7F7F7F7F7F7FL) | word | 0x7F7F7F7F7F7F7F7FL); + if (tmp != 0) { + return i + ((littleEndian ? Long.numberOfTrailingZeros(tmp) : Long.numberOfLeadingZeros(tmp)) >>> 3); } } - for (; position < numBytesPerValue; position++) { - byte b = _dataBuffer.getByte(startOffset + position); - if (b == paddingByte) { + for (; i < numBytesPerValue; i++) { + byte b = _dataBuffer.getByte(startOffset + i); + if (b == 0) { break; } - buffer[position] = b; + buffer[i] = b; } - return position; + return i; } @Override - public byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte paddingByte, byte[] buffer) { - int length = readUnpaddedBytes(index, numBytesPerValue, paddingByte, buffer); + public byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer) { + int length = readUnpaddedBytes(index, numBytesPerValue, buffer); byte[] bytes = new byte[length]; System.arraycopy(buffer, 0, bytes, 0, length); return bytes; } @Override - public String getUnpaddedString(int index, int numBytesPerValue, byte paddingByte, byte[] buffer) { - int length = readUnpaddedBytes(index, numBytesPerValue, paddingByte, buffer); + public String getUnpaddedString(int index, int numBytesPerValue, byte[] buffer) { + int length = readUnpaddedBytes(index, numBytesPerValue, buffer); return new String(buffer, 0, length, UTF_8); } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java index 225115bd5f..9f582abe95 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java @@ -41,20 +41,15 @@ public interface ValueReader extends Closeable { } /** - * Get un-padded bytes for string. + * Returns un-padded bytes for string. * NOTE: The passed in reusable buffer should have capacity of at least {@code numBytesPerValue}. - * @param index - * @param numBytesPerValue - * @param paddingByte - * @param buffer - * @return */ - byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte paddingByte, byte[] buffer); + byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer); /** * NOTE: The passed in reusable buffer should have capacity of at least {@code numBytesPerValue}. */ - String getUnpaddedString(int index, int numBytesPerValue, byte paddingByte, byte[] buffer); + String getUnpaddedString(int index, int numBytesPerValue, byte[] buffer); /** * NOTE: The passed in reusable buffer should have capacity of at least {@code numBytesPerValue}. @@ -67,7 +62,7 @@ public interface ValueReader extends Closeable { byte[] getBytes(int index, int numBytesPerValue); /** - * Assumes a zero padding byte, caller responsible for checking before use. + * Returns the comparison result of the UTF-8 decoded values. */ int compareUtf8Bytes(int index, int numBytesPerValue, byte[] bytes); } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java index 891bbe4011..596cb3125d 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java @@ -82,12 +82,12 @@ public class VarLengthValueReader implements ValueReader { } @Override - public byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte paddingByte, byte[] buffer) { + public byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer) { return getBytes(index, numBytesPerValue); } @Override - public String getUnpaddedString(int index, int numBytesPerValue, byte paddingByte, byte[] buffer) { + public String getUnpaddedString(int index, int numBytesPerValue, byte[] buffer) { assert buffer.length >= numBytesPerValue; // Read the offset of the byte array first and then read the actual byte array. diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/column/PhysicalColumnIndexContainer.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/column/PhysicalColumnIndexContainer.java index 3a32aa8ada..01cd6f35d7 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/column/PhysicalColumnIndexContainer.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/column/PhysicalColumnIndexContainer.java @@ -134,8 +134,9 @@ public final class PhysicalColumnIndexContainer implements ColumnIndexContainer } // Setting the 'fwdIndexBuffer' to null if forward index is disabled - PinotDataBuffer fwdIndexBuffer = segmentReader.hasIndexFor(columnName, ColumnIndexType.FORWARD_INDEX) - ? segmentReader.getIndexFor(columnName, ColumnIndexType.FORWARD_INDEX) : null; + PinotDataBuffer fwdIndexBuffer = + segmentReader.hasIndexFor(columnName, ColumnIndexType.FORWARD_INDEX) ? segmentReader.getIndexFor(columnName, + ColumnIndexType.FORWARD_INDEX) : null; if (metadata.hasDictionary()) { // Dictionary-based index @@ -260,9 +261,8 @@ public final class PhysicalColumnIndexContainer implements ColumnIndexContainer : new BigDecimalDictionary(dictionaryBuffer, length, numBytesPerValue); case STRING: numBytesPerValue = metadata.getColumnMaxLength(); - byte paddingByte = (byte) metadata.getPaddingCharacter(); - return loadOnHeap ? new OnHeapStringDictionary(dictionaryBuffer, length, numBytesPerValue, paddingByte) - : new StringDictionary(dictionaryBuffer, length, numBytesPerValue, paddingByte); + return loadOnHeap ? new OnHeapStringDictionary(dictionaryBuffer, length, numBytesPerValue) + : new StringDictionary(dictionaryBuffer, length, numBytesPerValue); case BYTES: numBytesPerValue = metadata.getColumnMaxLength(); return loadOnHeap ? new OnHeapBytesDictionary(dictionaryBuffer, length, numBytesPerValue) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/bloomfilter/BloomFilterHandler.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/bloomfilter/BloomFilterHandler.java index 4693b3d4dd..109a945f30 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/bloomfilter/BloomFilterHandler.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/bloomfilter/BloomFilterHandler.java @@ -306,8 +306,7 @@ public class BloomFilterHandler extends BaseIndexHandler { case DOUBLE: return new DoubleDictionary(dictionaryBuffer, cardinality); case STRING: - return new StringDictionary(dictionaryBuffer, cardinality, columnMetadata.getColumnMaxLength(), - (byte) columnMetadata.getPaddingCharacter()); + return new StringDictionary(dictionaryBuffer, cardinality, columnMetadata.getColumnMaxLength()); case BYTES: return new BytesDictionary(dictionaryBuffer, cardinality, columnMetadata.getColumnMaxLength()); default: diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java index 7f3c97c16d..a3c2479bbc 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java @@ -121,46 +121,40 @@ public class ColumnMinMaxValueGenerator { switch (dataType) { case INT: try (IntDictionary intDictionary = new IntDictionary(dictionaryBuffer, length)) { - SegmentColumnarIndexCreator - .addColumnMinMaxValueInfo(_segmentProperties, columnName, intDictionary.getStringValue(0), - intDictionary.getStringValue(length - 1)); + SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, + intDictionary.getStringValue(0), intDictionary.getStringValue(length - 1)); } break; case LONG: try (LongDictionary longDictionary = new LongDictionary(dictionaryBuffer, length)) { - SegmentColumnarIndexCreator - .addColumnMinMaxValueInfo(_segmentProperties, columnName, longDictionary.getStringValue(0), - longDictionary.getStringValue(length - 1)); + SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, + longDictionary.getStringValue(0), longDictionary.getStringValue(length - 1)); } break; case FLOAT: try (FloatDictionary floatDictionary = new FloatDictionary(dictionaryBuffer, length)) { - SegmentColumnarIndexCreator - .addColumnMinMaxValueInfo(_segmentProperties, columnName, floatDictionary.getStringValue(0), - floatDictionary.getStringValue(length - 1)); + SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, + floatDictionary.getStringValue(0), floatDictionary.getStringValue(length - 1)); } break; case DOUBLE: try (DoubleDictionary doubleDictionary = new DoubleDictionary(dictionaryBuffer, length)) { - SegmentColumnarIndexCreator - .addColumnMinMaxValueInfo(_segmentProperties, columnName, doubleDictionary.getStringValue(0), - doubleDictionary.getStringValue(length - 1)); + SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, + doubleDictionary.getStringValue(0), doubleDictionary.getStringValue(length - 1)); } break; case STRING: try (StringDictionary stringDictionary = new StringDictionary(dictionaryBuffer, length, - columnMetadata.getColumnMaxLength(), (byte) columnMetadata.getPaddingCharacter())) { - SegmentColumnarIndexCreator - .addColumnMinMaxValueInfo(_segmentProperties, columnName, stringDictionary.getStringValue(0), - stringDictionary.getStringValue(length - 1)); + columnMetadata.getColumnMaxLength())) { + SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, + stringDictionary.getStringValue(0), stringDictionary.getStringValue(length - 1)); } break; case BYTES: try (BytesDictionary bytesDictionary = new BytesDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength())) { - SegmentColumnarIndexCreator - .addColumnMinMaxValueInfo(_segmentProperties, columnName, bytesDictionary.getStringValue(0), - bytesDictionary.getStringValue(length - 1)); + SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, + bytesDictionary.getStringValue(0), bytesDictionary.getStringValue(length - 1)); } break; default: diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java index f886038882..0c162e09f0 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java @@ -22,7 +22,6 @@ import com.google.common.base.Preconditions; import it.unimi.dsi.fastutil.ints.IntSet; import java.io.IOException; import java.math.BigDecimal; -import java.util.Arrays; import org.apache.pinot.segment.local.io.util.FixedByteValueReaderWriter; import org.apache.pinot.segment.local.io.util.ValueReader; import org.apache.pinot.segment.local.io.util.VarLengthValueReader; @@ -41,21 +40,18 @@ public abstract class BaseImmutableDictionary implements Dictionary { private final ValueReader _valueReader; private final int _length; private final int _numBytesPerValue; - private final byte _paddingByte; - protected BaseImmutableDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue, byte paddingByte) { + protected BaseImmutableDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue) { if (VarLengthValueReader.isVarLengthValueBuffer(dataBuffer)) { VarLengthValueReader valueReader = new VarLengthValueReader(dataBuffer); _valueReader = valueReader; _length = valueReader.getNumValues(); - _paddingByte = 0; } else { Preconditions.checkState(dataBuffer.size() == (long) length * numBytesPerValue, "Buffer size mismatch: bufferSize = %s, numValues = %s, numByesPerValue = %s", dataBuffer.size(), length, numBytesPerValue); _valueReader = new FixedByteValueReaderWriter(dataBuffer); _length = length; - _paddingByte = paddingByte; } _numBytesPerValue = numBytesPerValue; } @@ -67,7 +63,6 @@ public abstract class BaseImmutableDictionary implements Dictionary { _valueReader = null; _length = length; _numBytesPerValue = -1; - _paddingByte = 0; } @Override @@ -211,44 +206,20 @@ public abstract class BaseImmutableDictionary implements Dictionary { return -(low + 1); } - /** - * WARNING: With non-zero padding byte, binary search result might not reflect the real insertion index for the value. - * E.g. with padding byte 'b', if unpadded value "aa" is in the dictionary, and stored as "aab", then unpadded value - * "a" will be mis-positioned after value "aa"; unpadded value "aab" will return positive value even if value "aab" is - * not in the dictionary. - * TODO: Clean up the segments with legacy non-zero padding byte, and remove the support for non-zero padding byte - */ protected int binarySearch(String value) { int low = 0; int high = _length - 1; - if (_paddingByte == 0) { - byte[] utf8 = value.getBytes(UTF_8); - while (low <= high) { - int mid = (low + high) >>> 1; - // method requires zero padding byte, not safe to call in nonzero padding byte branch below - int compareResult = _valueReader.compareUtf8Bytes(mid, _numBytesPerValue, utf8); - if (compareResult < 0) { - low = mid + 1; - } else if (compareResult > 0) { - high = mid - 1; - } else { - return mid; - } - } - } else { - byte[] buffer = getBuffer(); - String paddedValue = padString(value); - while (low <= high) { - int mid = (low + high) >>> 1; - String midValue = _valueReader.getPaddedString(mid, _numBytesPerValue, buffer); - int compareResult = midValue.compareTo(paddedValue); - if (compareResult < 0) { - low = mid + 1; - } else if (compareResult > 0) { - high = mid - 1; - } else { - return mid; - } + byte[] utf8 = value.getBytes(UTF_8); + while (low <= high) { + int mid = (low + high) >>> 1; + // method requires zero padding byte, not safe to call in nonzero padding byte branch below + int compareResult = _valueReader.compareUtf8Bytes(mid, _numBytesPerValue, utf8); + if (compareResult < 0) { + low = mid + 1; + } else if (compareResult > 0) { + high = mid - 1; + } else { + return mid; } } return -(low + 1); @@ -273,21 +244,6 @@ public abstract class BaseImmutableDictionary implements Dictionary { return -(low + 1); } - protected String padString(String value) { - byte[] valueBytes = value.getBytes(UTF_8); - int length = valueBytes.length; - String paddedValue; - if (length >= _numBytesPerValue) { - paddedValue = value; - } else { - byte[] paddedValueBytes = new byte[_numBytesPerValue]; - System.arraycopy(valueBytes, 0, paddedValueBytes, 0, length); - Arrays.fill(paddedValueBytes, length, _numBytesPerValue, _paddingByte); - paddedValue = new String(paddedValueBytes, UTF_8); - } - return paddedValue; - } - protected int getInt(int dictId) { return _valueReader.getInt(dictId); } @@ -309,11 +265,11 @@ public abstract class BaseImmutableDictionary implements Dictionary { } protected byte[] getUnpaddedBytes(int dictId, byte[] buffer) { - return _valueReader.getUnpaddedBytes(dictId, _numBytesPerValue, _paddingByte, buffer); + return _valueReader.getUnpaddedBytes(dictId, _numBytesPerValue, buffer); } protected String getUnpaddedString(int dictId, byte[] buffer) { - return _valueReader.getUnpaddedString(dictId, _numBytesPerValue, _paddingByte, buffer); + return _valueReader.getUnpaddedString(dictId, _numBytesPerValue, buffer); } protected String getPaddedString(int dictId, byte[] buffer) { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java index 991a7ddd5f..a6cfa817c6 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java @@ -30,7 +30,7 @@ import org.apache.pinot.spi.utils.BigDecimalUtils; public class BigDecimalDictionary extends BaseImmutableDictionary { public BigDecimalDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue) { - super(dataBuffer, length, numBytesPerValue, (byte) 0); + super(dataBuffer, length, numBytesPerValue); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java index 0fc8049d7d..3528f1e74d 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java @@ -32,7 +32,7 @@ import org.apache.pinot.spi.utils.BytesUtils; public class BytesDictionary extends BaseImmutableDictionary { public BytesDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue) { - super(dataBuffer, length, numBytesPerValue, (byte) 0); + super(dataBuffer, length, numBytesPerValue); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/DoubleDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/DoubleDictionary.java index 005ab557f7..58d0a797a5 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/DoubleDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/DoubleDictionary.java @@ -26,7 +26,7 @@ import org.apache.pinot.spi.data.FieldSpec.DataType; public class DoubleDictionary extends BaseImmutableDictionary { public DoubleDictionary(PinotDataBuffer dataBuffer, int length) { - super(dataBuffer, length, Double.BYTES, (byte) 0); + super(dataBuffer, length, Double.BYTES); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/FloatDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/FloatDictionary.java index df97837a12..85ac3ef08c 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/FloatDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/FloatDictionary.java @@ -26,7 +26,7 @@ import org.apache.pinot.spi.data.FieldSpec.DataType; public class FloatDictionary extends BaseImmutableDictionary { public FloatDictionary(PinotDataBuffer dataBuffer, int length) { - super(dataBuffer, length, Float.BYTES, (byte) 0); + super(dataBuffer, length, Float.BYTES); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/IntDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/IntDictionary.java index aea1ff4c1a..368f0df1f4 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/IntDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/IntDictionary.java @@ -26,7 +26,7 @@ import org.apache.pinot.spi.data.FieldSpec.DataType; public class IntDictionary extends BaseImmutableDictionary { public IntDictionary(PinotDataBuffer dataBuffer, int length) { - super(dataBuffer, length, Integer.BYTES, (byte) 0); + super(dataBuffer, length, Integer.BYTES); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/LongDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/LongDictionary.java index d57504c042..b5ab2ebe4a 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/LongDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/LongDictionary.java @@ -26,7 +26,7 @@ import org.apache.pinot.spi.data.FieldSpec.DataType; public class LongDictionary extends BaseImmutableDictionary { public LongDictionary(PinotDataBuffer dataBuffer, int length) { - super(dataBuffer, length, Long.BYTES, (byte) 0); + super(dataBuffer, length, Long.BYTES); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapBigDecimalDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapBigDecimalDictionary.java index 00d071e117..4642254ff3 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapBigDecimalDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapBigDecimalDictionary.java @@ -39,7 +39,7 @@ public class OnHeapBigDecimalDictionary extends BaseImmutableDictionary { private final BigDecimal[] _dictIdToVal; public OnHeapBigDecimalDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue) { - super(dataBuffer, length, numBytesPerValue, (byte) 0); + super(dataBuffer, length, numBytesPerValue); _dictIdToVal = new BigDecimal[length]; for (int dictId = 0; dictId < length; dictId++) { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapBytesDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapBytesDictionary.java index 7ba5ef51e6..ece5e31265 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapBytesDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapBytesDictionary.java @@ -42,7 +42,7 @@ public class OnHeapBytesDictionary extends BaseImmutableDictionary { private final ByteArray[] _dictIdToVal; public OnHeapBytesDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue) { - super(dataBuffer, length, numBytesPerValue, (byte) 0); + super(dataBuffer, length, numBytesPerValue); _valToDictId = new Object2IntOpenHashMap<>(length); _valToDictId.defaultReturnValue(Dictionary.NULL_VALUE_INDEX); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapDoubleDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapDoubleDictionary.java index 61e06499aa..61bca0ea73 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapDoubleDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapDoubleDictionary.java @@ -47,7 +47,7 @@ public class OnHeapDoubleDictionary extends BaseImmutableDictionary { * @param length Length of the dictionary */ public OnHeapDoubleDictionary(PinotDataBuffer dataBuffer, int length) { - super(dataBuffer, length, Double.BYTES, (byte) 0); + super(dataBuffer, length, Double.BYTES); _valToDictId = new Double2IntOpenHashMap(length); _valToDictId.defaultReturnValue(Dictionary.NULL_VALUE_INDEX); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapFloatDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapFloatDictionary.java index 581e61018c..4a6224eac0 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapFloatDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapFloatDictionary.java @@ -47,7 +47,7 @@ public class OnHeapFloatDictionary extends BaseImmutableDictionary { * @param length Length of the dictionary */ public OnHeapFloatDictionary(PinotDataBuffer dataBuffer, int length) { - super(dataBuffer, length, Float.BYTES, (byte) 0); + super(dataBuffer, length, Float.BYTES); _valToDictId = new Float2IntOpenHashMap(length); _valToDictId.defaultReturnValue(Dictionary.NULL_VALUE_INDEX); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapIntDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapIntDictionary.java index 19cc196ade..ec597926b2 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapIntDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapIntDictionary.java @@ -47,7 +47,7 @@ public class OnHeapIntDictionary extends BaseImmutableDictionary { * @param length Length of the dictionary */ public OnHeapIntDictionary(PinotDataBuffer dataBuffer, int length) { - super(dataBuffer, length, Integer.BYTES, (byte) 0); + super(dataBuffer, length, Integer.BYTES); _valToDictId = new Int2IntOpenHashMap(length); _valToDictId.defaultReturnValue(Dictionary.NULL_VALUE_INDEX); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapLongDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapLongDictionary.java index f122f3ed96..ae3162aa8e 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapLongDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapLongDictionary.java @@ -47,7 +47,7 @@ public class OnHeapLongDictionary extends BaseImmutableDictionary { * @param length Length of the dictionary */ public OnHeapLongDictionary(PinotDataBuffer dataBuffer, int length) { - super(dataBuffer, length, Long.BYTES, (byte) 0); + super(dataBuffer, length, Long.BYTES); _valToDictId = new Long2IntOpenHashMap(length); _valToDictId.defaultReturnValue(Dictionary.NULL_VALUE_INDEX); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapStringDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapStringDictionary.java index 96b82d75f8..13ab350b7f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapStringDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapStringDictionary.java @@ -38,38 +38,24 @@ import static java.nio.charset.StandardCharsets.UTF_8; * <p>This helps avoid creation of String from byte[], which is expensive as well as creates garbage. */ public class OnHeapStringDictionary extends BaseImmutableDictionary { - private final byte _paddingByte; private final String[] _unpaddedStrings; private final byte[][] _unpaddedBytes; - private final Object2IntOpenHashMap<String> _unPaddedStringToIdMap; - private final String[] _paddedStrings; - - public OnHeapStringDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue, byte paddingByte) { - super(dataBuffer, length, numBytesPerValue, paddingByte); - _paddingByte = paddingByte; - byte[] buffer = new byte[numBytesPerValue]; + public OnHeapStringDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue) { + super(dataBuffer, length, numBytesPerValue); _unpaddedBytes = new byte[length][]; _unpaddedStrings = new String[length]; _unPaddedStringToIdMap = new Object2IntOpenHashMap<>(length); _unPaddedStringToIdMap.defaultReturnValue(Dictionary.NULL_VALUE_INDEX); + byte[] buffer = new byte[numBytesPerValue]; for (int i = 0; i < length; i++) { _unpaddedBytes[i] = getUnpaddedBytes(i, buffer); _unpaddedStrings[i] = new String(_unpaddedBytes[i], UTF_8); _unPaddedStringToIdMap.put(_unpaddedStrings[i], i); } - - if (paddingByte == 0) { - _paddedStrings = null; - } else { - _paddedStrings = new String[length]; - for (int i = 0; i < length; i++) { - _paddedStrings[i] = getPaddedString(i, buffer); - } - } } @Override @@ -82,21 +68,13 @@ public class OnHeapStringDictionary extends BaseImmutableDictionary { return _unPaddedStringToIdMap.getInt(stringValue); } - /** - * WARNING: With non-zero padding byte, binary search result might not reflect the real insertion index for the value. - * E.g. with padding byte 'b', if unpadded value "aa" is in the dictionary, and stored as "aab", then unpadded value - * "a" will be mis-positioned after value "aa"; unpadded value "aab" will return positive value even if value "aab" is - * not in the dictionary. - * TODO: Clean up the segments with legacy non-zero padding byte, and remove the support for non-zero padding byte - */ @Override public int insertionIndexOf(String stringValue) { int index = _unPaddedStringToIdMap.getInt(stringValue); if (index != Dictionary.NULL_VALUE_INDEX) { return index; } else { - return _paddingByte == 0 ? Arrays.binarySearch(_unpaddedStrings, stringValue) - : Arrays.binarySearch(_paddedStrings, padString(stringValue)); + return Arrays.binarySearch(_unpaddedStrings, stringValue); } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java index 15a3eefc82..502e4b79cc 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java @@ -25,8 +25,8 @@ import org.apache.pinot.spi.data.FieldSpec.DataType; public class StringDictionary extends BaseImmutableDictionary { - public StringDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue, byte paddingByte) { - super(dataBuffer, length, numBytesPerValue, paddingByte); + public StringDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue) { + super(dataBuffer, length, numBytesPerValue); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java index 1adc20e00f..b1fea42680 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java @@ -68,7 +68,7 @@ public class ImmutableJsonIndexReader implements JsonIndexReader { long dictionaryEndOffset = dictionaryStartOffset + dictionaryLength; _dictionary = new StringDictionary(dataBuffer.view(dictionaryStartOffset, dictionaryEndOffset, ByteOrder.BIG_ENDIAN), 0, - maxValueLength, (byte) 0); + maxValueLength); long invertedIndexEndOffset = dictionaryEndOffset + invertedIndexLength; _invertedIndex = new BitmapInvertedIndexReader( dataBuffer.view(dictionaryEndOffset, invertedIndexEndOffset, ByteOrder.BIG_ENDIAN), _dictionary.length()); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/util/VarLengthValueReaderWriterTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/util/VarLengthValueReaderWriterTest.java index aef88fbe70..9c4dfd3b5b 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/util/VarLengthValueReaderWriterTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/util/VarLengthValueReaderWriterTest.java @@ -80,7 +80,7 @@ public class VarLengthValueReaderWriterTest { try (VarLengthValueReader reader = new VarLengthValueReader(dataBuffer)) { assertEquals(reader.getNumValues(), 1); byte[] buffer = new byte[MAX_STRING_LENGTH]; - assertEquals(reader.getUnpaddedString(0, MAX_STRING_LENGTH, (byte) 0, buffer), value); + assertEquals(reader.getUnpaddedString(0, MAX_STRING_LENGTH, buffer), value); assertEquals(reader.getBytes(0, MAX_STRING_LENGTH), valueBytes); } } @@ -108,7 +108,7 @@ public class VarLengthValueReaderWriterTest { assertEquals(reader.getNumValues(), NUM_VALUES); byte[] buffer = new byte[MAX_STRING_LENGTH]; for (int i = 0; i < NUM_VALUES; i++) { - assertEquals(reader.getUnpaddedString(i, MAX_STRING_LENGTH, (byte) 0, buffer), values[i]); + assertEquals(reader.getUnpaddedString(i, MAX_STRING_LENGTH, buffer), values[i]); assertEquals(reader.getBytes(i, MAX_STRING_LENGTH), valueBytesArray[i]); } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/LoaderTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/LoaderTest.java index c64cf6c614..50b8489fbd 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/LoaderTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/LoaderTest.java @@ -28,13 +28,10 @@ import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.io.FileUtils; -import org.apache.pinot.common.utils.TarGzCompressionUtils; import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader; import org.apache.pinot.segment.local.segment.creator.SegmentTestUtils; import org.apache.pinot.segment.local.segment.creator.impl.SegmentCreationDriverFactory; import org.apache.pinot.segment.local.segment.index.converter.SegmentV1V2ToV3FormatConverter; -import org.apache.pinot.segment.local.segment.index.readers.StringDictionary; -import org.apache.pinot.segment.spi.ColumnMetadata; import org.apache.pinot.segment.spi.ImmutableSegment; import org.apache.pinot.segment.spi.IndexSegment; import org.apache.pinot.segment.spi.V1Constants; @@ -45,7 +42,6 @@ import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl; import org.apache.pinot.segment.spi.loader.SegmentDirectoryLoader; import org.apache.pinot.segment.spi.loader.SegmentDirectoryLoaderContext; import org.apache.pinot.segment.spi.loader.SegmentDirectoryLoaderRegistry; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.segment.spi.store.ColumnIndexType; import org.apache.pinot.segment.spi.store.SegmentDirectory; import org.apache.pinot.segment.spi.store.SegmentDirectoryPaths; @@ -57,7 +53,6 @@ import org.apache.pinot.spi.env.PinotConfiguration; import org.apache.pinot.spi.utils.BytesUtils; import org.apache.pinot.spi.utils.CommonConstants.Segment.BuiltInVirtualColumn; import org.apache.pinot.spi.utils.ReadMode; -import org.apache.pinot.util.TestUtils; import org.testng.Assert; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; @@ -73,9 +68,6 @@ import static org.testng.Assert.assertTrue; public class LoaderTest { private static final File INDEX_DIR = new File(LoaderTest.class.getName()); private static final String AVRO_DATA = "data/test_data-mv.avro"; - private static final String PADDING_OLD = "data/paddingOld.tar.gz"; - private static final String PADDING_PERCENT = "data/paddingPercent.tar.gz"; - private static final String PADDING_NULL = "data/paddingNull.tar.gz"; private static final String TEXT_INDEX_COL_NAME = "column5"; private static final String FST_INDEX_COL_NAME = "column5"; @@ -222,77 +214,6 @@ public class LoaderTest { Assert.assertNotNull(indexSegment.getDataSource(BuiltInVirtualColumn.SEGMENTNAME)); } - @Test - public void testPadding() - throws Exception { - // Old Format - URL resourceUrl = LoaderTest.class.getClassLoader().getResource(PADDING_OLD); - Assert.assertNotNull(resourceUrl); - File segmentDirectory = - TarGzCompressionUtils.untar(new File(TestUtils.getFileFromResourceUrl(resourceUrl)), INDEX_DIR).get(0); - SegmentMetadataImpl segmentMetadata = new SegmentMetadataImpl(segmentDirectory); - ColumnMetadata columnMetadata = segmentMetadata.getColumnMetadataFor("name"); - Assert.assertEquals(columnMetadata.getPaddingCharacter(), V1Constants.Str.LEGACY_STRING_PAD_CHAR); - SegmentDirectory segmentDir = _localSegmentDirectoryLoader.load(segmentDirectory.toURI(), - new SegmentDirectoryLoaderContext.Builder().setSegmentName(segmentMetadata.getName()) - .setSegmentDirectoryConfigs(_pinotConfiguration).build()); - SegmentDirectory.Reader reader = segmentDir.createReader(); - PinotDataBuffer dictionaryBuffer = reader.getIndexFor("name", ColumnIndexType.DICTIONARY); - StringDictionary dict = - new StringDictionary(dictionaryBuffer, columnMetadata.getCardinality(), columnMetadata.getColumnMaxLength(), - (byte) columnMetadata.getPaddingCharacter()); - Assert.assertEquals(dict.getStringValue(0), "lynda 2.0"); - Assert.assertEquals(dict.getStringValue(1), "lynda"); - Assert.assertEquals(dict.get(0), "lynda 2.0"); - Assert.assertEquals(dict.get(1), "lynda"); - Assert.assertEquals(dict.indexOf("lynda%"), 1); - Assert.assertEquals(dict.indexOf("lynda%%"), 1); - - // New Format Padding character % - resourceUrl = LoaderTest.class.getClassLoader().getResource(PADDING_PERCENT); - Assert.assertNotNull(resourceUrl); - segmentDirectory = - TarGzCompressionUtils.untar(new File(TestUtils.getFileFromResourceUrl(resourceUrl)), INDEX_DIR).get(0); - segmentMetadata = new SegmentMetadataImpl(segmentDirectory); - columnMetadata = segmentMetadata.getColumnMetadataFor("name"); - Assert.assertEquals(columnMetadata.getPaddingCharacter(), V1Constants.Str.LEGACY_STRING_PAD_CHAR); - segmentDir = _localSegmentDirectoryLoader.load(segmentDirectory.toURI(), - new SegmentDirectoryLoaderContext.Builder().setSegmentName(segmentMetadata.getName()) - .setSegmentDirectoryConfigs(_pinotConfiguration).build()); - reader = segmentDir.createReader(); - dictionaryBuffer = reader.getIndexFor("name", ColumnIndexType.DICTIONARY); - dict = new StringDictionary(dictionaryBuffer, columnMetadata.getCardinality(), columnMetadata.getColumnMaxLength(), - (byte) columnMetadata.getPaddingCharacter()); - Assert.assertEquals(dict.getStringValue(0), "lynda 2.0"); - Assert.assertEquals(dict.getStringValue(1), "lynda"); - Assert.assertEquals(dict.get(0), "lynda 2.0"); - Assert.assertEquals(dict.get(1), "lynda"); - Assert.assertEquals(dict.indexOf("lynda%"), 1); - Assert.assertEquals(dict.indexOf("lynda%%"), 1); - - // New Format Padding character Null - resourceUrl = LoaderTest.class.getClassLoader().getResource(PADDING_NULL); - Assert.assertNotNull(resourceUrl); - segmentDirectory = - TarGzCompressionUtils.untar(new File(TestUtils.getFileFromResourceUrl(resourceUrl)), INDEX_DIR).get(0); - segmentMetadata = new SegmentMetadataImpl(segmentDirectory); - columnMetadata = segmentMetadata.getColumnMetadataFor("name"); - Assert.assertEquals(columnMetadata.getPaddingCharacter(), V1Constants.Str.DEFAULT_STRING_PAD_CHAR); - segmentDir = _localSegmentDirectoryLoader.load(segmentDirectory.toURI(), - new SegmentDirectoryLoaderContext.Builder().setSegmentName(segmentMetadata.getName()) - .setSegmentDirectoryConfigs(_pinotConfiguration).build()); - reader = segmentDir.createReader(); - dictionaryBuffer = reader.getIndexFor("name", ColumnIndexType.DICTIONARY); - dict = new StringDictionary(dictionaryBuffer, columnMetadata.getCardinality(), columnMetadata.getColumnMaxLength(), - (byte) columnMetadata.getPaddingCharacter()); - Assert.assertEquals(dict.getStringValue(0), "lynda"); - Assert.assertEquals(dict.getStringValue(1), "lynda 2.0"); - Assert.assertEquals(dict.get(0), "lynda"); - Assert.assertEquals(dict.get(1), "lynda 2.0"); - Assert.assertEquals(dict.insertionIndexOf("lynda\0"), -2); - Assert.assertEquals(dict.insertionIndexOf("lynda\0\0"), -2); - } - /** * Tests loading default string column with empty ("") default null value. */ diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTest.java index 4344e39970..bdab861630 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTest.java @@ -360,8 +360,8 @@ public class ImmutableDictionaryTest { public void testStringDictionary() throws Exception { try (StringDictionary stringDictionary = new StringDictionary(PinotDataBuffer.mapReadOnlyBigEndianFile( - new File(TEMP_DIR, STRING_COLUMN_NAME + V1Constants.Dict.FILE_EXTENSION)), NUM_VALUES, _numBytesPerStringValue, - (byte) 0)) { + new File(TEMP_DIR, STRING_COLUMN_NAME + V1Constants.Dict.FILE_EXTENSION)), NUM_VALUES, + _numBytesPerStringValue)) { testStringDictionary(stringDictionary); } } @@ -372,7 +372,7 @@ public class ImmutableDictionaryTest { try (OnHeapStringDictionary onHeapStringDictionary = new OnHeapStringDictionary( PinotDataBuffer.mapReadOnlyBigEndianFile( new File(TEMP_DIR, STRING_COLUMN_NAME + V1Constants.Dict.FILE_EXTENSION)), NUM_VALUES, - _numBytesPerStringValue, (byte) 0)) { + _numBytesPerStringValue)) { testStringDictionary(onHeapStringDictionary); } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTypeConversionTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTypeConversionTest.java index f4997e6bb1..303f87b520 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTypeConversionTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTypeConversionTest.java @@ -312,8 +312,7 @@ public class ImmutableDictionaryTypeConversionTest { public void testStringDictionary() throws Exception { try (StringDictionary stringDictionary = new StringDictionary(PinotDataBuffer.mapReadOnlyBigEndianFile( - new File(TEMP_DIR, STRING_COLUMN_NAME + V1Constants.Dict.FILE_EXTENSION)), NUM_VALUES, STRING_LENGTH, - (byte) 0)) { + new File(TEMP_DIR, STRING_COLUMN_NAME + V1Constants.Dict.FILE_EXTENSION)), NUM_VALUES, STRING_LENGTH)) { testStringDictionary(stringDictionary); } } @@ -323,8 +322,7 @@ public class ImmutableDictionaryTypeConversionTest { throws Exception { try (OnHeapStringDictionary onHeapStringDictionary = new OnHeapStringDictionary( PinotDataBuffer.mapReadOnlyBigEndianFile( - new File(TEMP_DIR, STRING_COLUMN_NAME + V1Constants.Dict.FILE_EXTENSION)), NUM_VALUES, STRING_LENGTH, - (byte) 0)) { + new File(TEMP_DIR, STRING_COLUMN_NAME + V1Constants.Dict.FILE_EXTENSION)), NUM_VALUES, STRING_LENGTH)) { testStringDictionary(onHeapStringDictionary); } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java index 28d0a8ecc1..5e6a5c673d 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java @@ -56,7 +56,6 @@ public class FixedByteValueReaderWriterTest { @Test(dataProvider = "params") public void testFixedByteValueReaderWriter(int maxStringLength, int configuredMaxLength, ByteOrder byteOrder) throws IOException { - byte nt = 0; byte[] bytes = new byte[configuredMaxLength]; try (PinotDataBuffer buffer = PinotDataBuffer.allocateDirect(configuredMaxLength * 1000L, byteOrder, "testFixedByteValueReaderWriter")) { @@ -67,10 +66,10 @@ public class FixedByteValueReaderWriterTest { Arrays.fill(bytes, 0, length, (byte) 'a'); readerWriter.writeBytes(i, configuredMaxLength, bytes); inputs.add(new String(bytes, 0, length, StandardCharsets.UTF_8)); - Arrays.fill(bytes, nt); + Arrays.fill(bytes, 0, length, (byte) 0); } for (int i = 0; i < 1000; i++) { - assertEquals(readerWriter.getUnpaddedString(i, configuredMaxLength, nt, bytes), inputs.get(i)); + assertEquals(readerWriter.getUnpaddedString(i, configuredMaxLength, bytes), inputs.get(i)); } } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/ValueReaderComparisonTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/ValueReaderComparisonTest.java index dbd18601f7..80462e2f34 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/ValueReaderComparisonTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/ValueReaderComparisonTest.java @@ -301,19 +301,19 @@ public class ValueReaderComparisonTest { private static void assertUtf8Comparison(ValueReader readerWriter, int index, int numBytesPerValue, String string, int signum) { byte[] value = string.getBytes(StandardCharsets.UTF_8); - String stored = readerWriter.getUnpaddedString(index, numBytesPerValue, (byte) 0, new byte[numBytesPerValue]); + String stored = readerWriter.getUnpaddedString(index, numBytesPerValue, new byte[numBytesPerValue]); String error = stored + " " + string; int comparisonViaMaterialization = stored.compareTo(string); - assertEquals(signum, Integer.compare(comparisonViaMaterialization, 0), error); + assertEquals(Integer.compare(comparisonViaMaterialization, 0), signum, error); int utf8Comparison = readerWriter.compareUtf8Bytes(index, numBytesPerValue, value); - assertEquals(signum, Integer.compare(utf8Comparison, 0), error); + assertEquals(Integer.compare(utf8Comparison, 0), signum, error); } private static void assertConsistentUtf8Comparison(ValueReader readerWriter, int index, int numBytesPerValue, String string) { byte[] value = string.getBytes(StandardCharsets.UTF_8); int utf8Comparison = readerWriter.compareUtf8Bytes(index, numBytesPerValue, value); - String stored = readerWriter.getUnpaddedString(index, numBytesPerValue, (byte) 0, new byte[numBytesPerValue]); + String stored = readerWriter.getUnpaddedString(index, numBytesPerValue, new byte[numBytesPerValue]); int comparisonViaMaterialization = stored.compareTo(string); assertTrue( (utf8Comparison == comparisonViaMaterialization) || (utf8Comparison < 0 && comparisonViaMaterialization < 0) diff --git a/pinot-segment-local/src/test/resources/data/paddingNull.tar.gz b/pinot-segment-local/src/test/resources/data/paddingNull.tar.gz deleted file mode 100644 index 628d54658e..0000000000 Binary files a/pinot-segment-local/src/test/resources/data/paddingNull.tar.gz and /dev/null differ diff --git a/pinot-segment-local/src/test/resources/data/paddingOld.tar.gz b/pinot-segment-local/src/test/resources/data/paddingOld.tar.gz deleted file mode 100644 index 2d2ba4a06e..0000000000 Binary files a/pinot-segment-local/src/test/resources/data/paddingOld.tar.gz and /dev/null differ diff --git a/pinot-segment-local/src/test/resources/data/paddingPercent.tar.gz b/pinot-segment-local/src/test/resources/data/paddingPercent.tar.gz deleted file mode 100644 index 429241b691..0000000000 Binary files a/pinot-segment-local/src/test/resources/data/paddingPercent.tar.gz and /dev/null differ diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java index 4346d1f744..620712e307 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java @@ -78,8 +78,6 @@ public interface ColumnMetadata { int getColumnMaxLength(); - char getPaddingCharacter(); - int getBitsPerElement(); int getMaxNumberOfMultiValues(); diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java index c9d4213b63..38d2e21264 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java @@ -29,7 +29,6 @@ public class V1Constants { public static class Str { public static final char DEFAULT_STRING_PAD_CHAR = '\0'; - public static final char LEGACY_STRING_PAD_CHAR = '%'; } public static class Dict { diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java index 6b0b3d72a9..dec639198b 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java @@ -18,6 +18,7 @@ */ package org.apache.pinot.segment.spi.index.metadata; +import com.google.common.base.Preconditions; import java.math.BigDecimal; import java.util.HashMap; import java.util.Iterator; @@ -29,7 +30,6 @@ import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; import org.apache.commons.configuration.Configuration; import org.apache.commons.configuration.PropertiesConfiguration; -import org.apache.commons.lang.StringEscapeUtils; import org.apache.pinot.segment.spi.ColumnMetadata; import org.apache.pinot.segment.spi.V1Constants; import org.apache.pinot.segment.spi.V1Constants.MetadataKeys.Column; @@ -60,7 +60,6 @@ public class ColumnMetadataImpl implements ColumnMetadata { private final boolean _minMaxValueInvalid; private final boolean _hasDictionary; private final int _columnMaxLength; - private final char _paddingCharacter; private final int _bitsPerElement; private final int _maxNumberOfMultiValues; private final int _totalNumberOfEntries; @@ -71,8 +70,8 @@ public class ColumnMetadataImpl implements ColumnMetadata { private ColumnMetadataImpl(FieldSpec fieldSpec, int totalDocs, int cardinality, boolean sorted, Comparable<?> minValue, Comparable<?> maxValue, boolean minMaxValueInvalid, boolean hasDictionary, - int columnMaxLength, char paddingCharacter, int bitsPerElement, int maxNumberOfMultiValues, - int totalNumberOfEntries, @Nullable PartitionFunction partitionFunction, @Nullable Set<Integer> partitions, + int columnMaxLength, int bitsPerElement, int maxNumberOfMultiValues, int totalNumberOfEntries, + @Nullable PartitionFunction partitionFunction, @Nullable Set<Integer> partitions, Map<ColumnIndexType, Long> indexSizeMap, boolean autoGenerated) { _fieldSpec = fieldSpec; _totalDocs = totalDocs; @@ -83,7 +82,6 @@ public class ColumnMetadataImpl implements ColumnMetadata { _minMaxValueInvalid = minMaxValueInvalid; _hasDictionary = hasDictionary; _columnMaxLength = columnMaxLength; - _paddingCharacter = paddingCharacter; _bitsPerElement = bitsPerElement; _maxNumberOfMultiValues = maxNumberOfMultiValues; _totalNumberOfEntries = totalNumberOfEntries; @@ -137,11 +135,6 @@ public class ColumnMetadataImpl implements ColumnMetadata { return _columnMaxLength; } - @Override - public char getPaddingCharacter() { - return _paddingCharacter; - } - @Override public int getBitsPerElement() { return _bitsPerElement; @@ -191,30 +184,28 @@ public class ColumnMetadataImpl implements ColumnMetadata { ColumnMetadataImpl that = (ColumnMetadataImpl) o; return _totalDocs == that._totalDocs && _cardinality == that._cardinality && _sorted == that._sorted && _hasDictionary == that._hasDictionary && _columnMaxLength == that._columnMaxLength - && _paddingCharacter == that._paddingCharacter && _bitsPerElement == that._bitsPerElement - && _maxNumberOfMultiValues == that._maxNumberOfMultiValues - && _totalNumberOfEntries == that._totalNumberOfEntries && _autoGenerated == that._autoGenerated && Objects - .equals(_fieldSpec, that._fieldSpec) && Objects.equals(_minValue, that._minValue) && Objects - .equals(_maxValue, that._maxValue) && Objects.equals(_partitionFunction, that._partitionFunction) && Objects - .equals(_partitions, that._partitions); + && _bitsPerElement == that._bitsPerElement && _maxNumberOfMultiValues == that._maxNumberOfMultiValues + && _totalNumberOfEntries == that._totalNumberOfEntries && _autoGenerated == that._autoGenerated + && Objects.equals(_fieldSpec, that._fieldSpec) && Objects.equals(_minValue, that._minValue) && Objects.equals( + _maxValue, that._maxValue) && Objects.equals(_partitionFunction, that._partitionFunction) && Objects.equals( + _partitions, that._partitions); } @Override public int hashCode() { - return Objects - .hash(_fieldSpec, _totalDocs, _cardinality, _sorted, _minValue, _maxValue, _hasDictionary, _columnMaxLength, - _paddingCharacter, _bitsPerElement, _maxNumberOfMultiValues, _totalNumberOfEntries, _partitionFunction, - _partitions, _autoGenerated); + return Objects.hash(_fieldSpec, _totalDocs, _cardinality, _sorted, _minValue, _maxValue, _hasDictionary, + _columnMaxLength, _bitsPerElement, _maxNumberOfMultiValues, _totalNumberOfEntries, _partitionFunction, + _partitions, _autoGenerated); } @Override public String toString() { return "ColumnMetadataImpl{" + "_fieldSpec=" + _fieldSpec + ", _totalDocs=" + _totalDocs + ", _cardinality=" + _cardinality + ", _sorted=" + _sorted + ", _minValue=" + _minValue + ", _maxValue=" + _maxValue - + ", _hasDictionary=" + _hasDictionary + ", _columnMaxLength=" + _columnMaxLength + ", _paddingCharacter=" - + _paddingCharacter + ", _bitsPerElement=" + _bitsPerElement + ", _maxNumberOfMultiValues=" - + _maxNumberOfMultiValues + ", _totalNumberOfEntries=" + _totalNumberOfEntries + ", _partitionFunction=" - + _partitionFunction + ", _partitions=" + _partitions + ", _autoGenerated=" + _autoGenerated + '}'; + + ", _hasDictionary=" + _hasDictionary + ", _columnMaxLength=" + _columnMaxLength + ", _bitsPerElement=" + + _bitsPerElement + ", _maxNumberOfMultiValues=" + _maxNumberOfMultiValues + ", _totalNumberOfEntries=" + + _totalNumberOfEntries + ", _partitionFunction=" + _partitionFunction + ", _partitions=" + _partitions + + ", _autoGenerated=" + _autoGenerated + '}'; } public static ColumnMetadataImpl fromPropertiesConfiguration(String column, PropertiesConfiguration config) { @@ -297,12 +288,10 @@ public class ColumnMetadataImpl implements ColumnMetadata { } builder.setMinMaxValueInvalid(config.getBoolean(Column.getKeyFor(column, Column.MIN_MAX_VALUE_INVALID), false)); - char paddingCharacter = V1Constants.Str.LEGACY_STRING_PAD_CHAR; + // Only support zero padding String padding = config.getString(Segment.SEGMENT_PADDING_CHARACTER, null); - if (padding != null) { - paddingCharacter = StringEscapeUtils.unescapeJava(padding).charAt(0); - } - builder.setPaddingCharacter(paddingCharacter); + Preconditions.checkState(String.valueOf(V1Constants.Str.DEFAULT_STRING_PAD_CHAR).equals(padding), + "Got non-zero string padding: %s", padding); String partitionFunctionName = config.getString(Column.getKeyFor(column, Column.PARTITION_FUNCTION), null); if (partitionFunctionName != null) { @@ -350,7 +339,6 @@ public class ColumnMetadataImpl implements ColumnMetadata { private boolean _minMaxValueInvalid; private boolean _hasDictionary; private int _columnMaxLength; - private char _paddingCharacter; private int _bitsPerElement; private int _maxNumberOfMultiValues; private int _totalNumberOfEntries; @@ -404,11 +392,6 @@ public class ColumnMetadataImpl implements ColumnMetadata { return this; } - public Builder setPaddingCharacter(char paddingCharacter) { - _paddingCharacter = paddingCharacter; - return this; - } - public Builder setBitsPerElement(int bitsPerElement) { _bitsPerElement = bitsPerElement; return this; @@ -445,9 +428,8 @@ public class ColumnMetadataImpl implements ColumnMetadata { public ColumnMetadataImpl build() { return new ColumnMetadataImpl(_fieldSpec, _totalDocs, _cardinality, _sorted, _minValue, _maxValue, - _minMaxValueInvalid, _hasDictionary, _columnMaxLength, _paddingCharacter, _bitsPerElement, - _maxNumberOfMultiValues, _totalNumberOfEntries, _partitionFunction, _partitions, _indexSizeMap, - _autoGenerated); + _minMaxValueInvalid, _hasDictionary, _columnMaxLength, _bitsPerElement, _maxNumberOfMultiValues, + _totalNumberOfEntries, _partitionFunction, _partitions, _indexSizeMap, _autoGenerated); } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org