This is an automated email from the ASF dual-hosted git repository.
xiangfu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 5172ada0e4f segment-local: NoDict hybrid cardinality (exact ≤ 2,048 →
HLL++ +10% cap) (#17186)
5172ada0e4f is described below
commit 5172ada0e4ff343dab14441a4aa3322989b2dd45
Author: Xiang Fu <[email protected]>
AuthorDate: Thu Nov 13 11:19:52 2025 -0800
segment-local: NoDict hybrid cardinality (exact ≤ 2,048 → HLL++ +10% cap)
(#17186)
- Track exact uniques up to 2,048; return exact count while active
- On threshold exceed, drop exact set and estimate via HLL++
- Apply +10% upward buffer, clamped to total entries, to avoid
underestimation
- Reduce small-N flakiness; preserve safety at scale
Tests
- SegmentPreProcessorTest: expect exact small-N cardinality 5 for derived
raw string
- NoDictColumnStatisticsCollectorTest: assertion message refactor;
tolerance unchanged
Implementation notes
- Add EXACT_UNIQUE_TRACKING_THRESHOLD and trackExactUnique()
- Normalize keys for byte[] and BigDecimal during exact tracking
---
.../stats/NoDictColumnStatisticsCollector.java | 45 ++++++++++++++++++++--
.../stats/NoDictColumnStatisticsCollectorTest.java | 8 ++--
.../index/loader/SegmentPreProcessorTest.java | 11 +++---
3 files changed, 53 insertions(+), 11 deletions(-)
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java
index 4962bd95db4..f047c56c479 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java
@@ -21,6 +21,8 @@ package
org.apache.pinot.segment.local.segment.creator.impl.stats;
import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.pinot.segment.spi.creator.StatsCollectorConfig;
import org.apache.pinot.spi.utils.BigDecimalUtils;
@@ -52,6 +54,10 @@ public class NoDictColumnStatisticsCollector extends
AbstractColumnStatisticsCol
private boolean _sealed = false;
// HLL Plus generally returns approximate cardinality >= actual cardinality
which is desired
private final HyperLogLogPlus _hllPlus;
+ // Track exact uniques up to a threshold to avoid small-N underestimation
and test flakiness
+ private static final int EXACT_UNIQUE_TRACKING_THRESHOLD = 2048;
+ // Single-threaded: simple Set is sufficient; set to null once threshold
exceeded to cap memory
+ private Set<Object> _exactUniques = new HashSet<>();
public NoDictColumnStatisticsCollector(String column, StatsCollectorConfig
statsCollectorConfig) {
super(column, statsCollectorConfig);
@@ -75,6 +81,7 @@ public class NoDictColumnStatisticsCollector extends
AbstractColumnStatisticsCol
}
updateMinMax(value);
updateHllPlus(value);
+ trackExactUnique(value);
int len = getValueLength(value);
_minLength = Math.min(_minLength, len);
_maxLength = Math.max(_maxLength, len);
@@ -92,6 +99,7 @@ public class NoDictColumnStatisticsCollector extends
AbstractColumnStatisticsCol
for (int value : values) {
updateMinMax(value);
updateHllPlus(value);
+ trackExactUnique(value);
}
length = values.length;
} else if (entry instanceof long[]) {
@@ -99,6 +107,7 @@ public class NoDictColumnStatisticsCollector extends
AbstractColumnStatisticsCol
for (long value : values) {
updateMinMax(value);
updateHllPlus(value);
+ trackExactUnique(value);
}
length = values.length;
} else if (entry instanceof float[]) {
@@ -106,6 +115,7 @@ public class NoDictColumnStatisticsCollector extends
AbstractColumnStatisticsCol
for (float value : values) {
updateMinMax(value);
updateHllPlus(value);
+ trackExactUnique(value);
}
length = values.length;
} else {
@@ -113,6 +123,7 @@ public class NoDictColumnStatisticsCollector extends
AbstractColumnStatisticsCol
for (double value : values) {
updateMinMax(value);
updateHllPlus(value);
+ trackExactUnique(value);
}
length = values.length;
}
@@ -123,6 +134,7 @@ public class NoDictColumnStatisticsCollector extends
AbstractColumnStatisticsCol
addressSorted(value);
updateMinMax(entry);
updateHllPlus(entry);
+ trackExactUnique(entry);
int len = getValueLength(entry);
_minLength = Math.min(_minLength, len);
_maxLength = Math.max(_maxLength, len);
@@ -208,8 +220,11 @@ public class NoDictColumnStatisticsCollector extends
AbstractColumnStatisticsCol
@Override
public int getCardinality() {
- // Get approximate distinct count estimate using HLL++
- // Increase by 10% to increase probability of not returning lower than
actual cardinality
+ // If we are still tracking exact uniques, return exact cardinality
+ if (_exactUniques != null) {
+ return _exactUniques.size();
+ }
+ // Get approximate distinct count estimate using HLL++ with a 10% upward
buffer
long estimate = Math.round(_hllPlus.cardinality() * 1.1);
// There are cases where approximation can overshoot the actual number of
entries.
// Returning a cardinality greater than total entries can break
assumptions.
@@ -223,10 +238,34 @@ public class NoDictColumnStatisticsCollector extends
AbstractColumnStatisticsCol
private void updateHllPlus(Object value) {
if (value instanceof BigDecimal) {
- // Canonicalize BigDecimal as string to avoid scale-related equality
issues
+ // Canonicalize BigDecimal as string to avoid scale-related equality
issues:
+ // BigDecimals with different scales (e.g., 1.0 vs 1.00) are not equal
by default,
+ // but their string representations normalize the value for cardinality
tracking.
_hllPlus.offer(((BigDecimal) value).toString());
} else {
_hllPlus.offer(value);
}
}
+
+ private void trackExactUnique(Object value) {
+ if (_exactUniques == null) {
+ return;
+ }
+ Object key;
+ if (value instanceof byte[]) {
+ key = new ByteArray((byte[]) value);
+ } else if (value instanceof BigDecimal) {
+ // Use string representation to avoid scale-related equality issues:
+ // BigDecimals with different scales (e.g., 1.0 vs 1.00) are not equal
by default,
+ // but their string representations normalize the value for cardinality
tracking.
+ key = ((BigDecimal) value).toString();
+ } else {
+ key = value;
+ }
+ _exactUniques.add(key);
+ if (_exactUniques.size() > EXACT_UNIQUE_TRACKING_THRESHOLD) {
+ // Drop exact tracking once the threshold is exceeded to avoid unbounded
memory.
+ _exactUniques = null;
+ }
+ }
}
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollectorTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollectorTest.java
index 3600f2f0562..546722c231f 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollectorTest.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollectorTest.java
@@ -781,9 +781,11 @@ public class NoDictColumnStatisticsCollectorTest {
String context = String.format("Iteration %d, DataType: %s, SingleValue:
%s, Data: %s",
iteration, dataType, isSingleValue, Arrays.deepToString(testData));
- assertTrue(noDictCollector.getCardinality() >=
expectedCollector.getCardinality(),
- "Approx Cardinality " + noDictCollector.getCardinality() + " is lower
than actual cardinality "
- + expectedCollector.getCardinality() + " for " + context);
+ int approx = noDictCollector.getCardinality();
+ int exact = expectedCollector.getCardinality();
+ assertTrue(approx >= exact,
+ "Approx Cardinality " + approx + " is lower than actual cardinality "
+ + exact + " for " + context);
assertEquals(noDictCollector.getMinValue(),
expectedCollector.getMinValue(),
"MinValue mismatch - " + context);
assertEquals(noDictCollector.getMaxValue(),
expectedCollector.getMaxValue(),
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
index 0263369bd10..4807763233a 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
@@ -391,7 +391,7 @@ public class SegmentPreProcessorTest implements
PinotBuffersAfterClassCheckRule
public void testSimpleEnableDictionarySV()
throws Exception {
int approxCardinality = 46934; // derived via
NoDictColumnStatisticsCollector
- int approxCardinalityStr = 6; // derived via
NoDictColumnStatisticsCollector
+ int approxCardinalityStr = 5; // derived via
NoDictColumnStatisticsCollector
// TEST 1. Check running forwardIndexHandler on a V1 segment. No-op for
all existing raw columns.
buildV1Segment();
checkForwardIndexCreation(EXISTING_STRING_COL_RAW, approxCardinalityStr,
3, _schema, false,
@@ -456,7 +456,7 @@ public class SegmentPreProcessorTest implements
PinotBuffersAfterClassCheckRule
// TEST 2: EXISTING_STRING_COL_RAW. Enable dictionary on a raw column that
already has text index.
resetIndexConfigs();
- int approxCardinalityStr = 6; // derived via
NoDictColumnStatisticsCollector
+ int approxCardinalityStr = 5; // derived via
NoDictColumnStatisticsCollector
_fieldConfigMap.put(EXISTING_STRING_COL_RAW,
new FieldConfig(EXISTING_STRING_COL_RAW, FieldConfig.EncodingType.RAW,
List.of(FieldConfig.IndexType.TEXT),
null, null));
@@ -642,7 +642,7 @@ public class SegmentPreProcessorTest implements
PinotBuffersAfterClassCheckRule
public void testForwardIndexHandlerChangeCompression()
throws Exception {
int approximateCardinality = 20516; // derived via
NoDictColumnStatisticsCollector
- int approxCardinalityStr = 6; // derived via
NoDictColumnStatisticsCollector
+ int approxCardinalityStr = 5; // derived via
NoDictColumnStatisticsCollector
// Test1: Rewriting forward index will be a no-op for v1 segments. Default
LZ4 compressionType will be retained.
buildV1Segment();
@@ -727,7 +727,7 @@ public class SegmentPreProcessorTest implements
PinotBuffersAfterClassCheckRule
@Test(dataProvider = "bothV1AndV3")
public void testEnableTextIndexOnExistingRawColumn(SegmentVersion
segmentVersion)
throws Exception {
- int approxCardinalityStr = 6; // derived via
NoDictColumnStatisticsCollector
+ int approxCardinalityStr = 5; // derived via
NoDictColumnStatisticsCollector
buildSegment(segmentVersion);
_fieldConfigMap.put(EXISTING_STRING_COL_RAW,
new FieldConfig(EXISTING_STRING_COL_RAW, FieldConfig.EncodingType.RAW,
List.of(FieldConfig.IndexType.TEXT),
@@ -1130,7 +1130,8 @@ public class SegmentPreProcessorTest implements
PinotBuffersAfterClassCheckRule
assertEquals(columnMetadata.getFieldSpec(), spec);
assertTrue(columnMetadata.isAutoGenerated());
originalColumnMetadata = segmentMetadata.getColumnMetadataFor("column3");
- assertEquals(columnMetadata.getCardinality(), 6); // cardinality derived
via NoDictColumnStatisticsCollector
+ // exact cardinality for small-N via NoDictColumnStatisticsCollector
+ assertEquals(columnMetadata.getCardinality(), 5);
assertEquals(columnMetadata.getBitsPerElement(),
originalColumnMetadata.getBitsPerElement());
assertEquals(columnMetadata.getTotalNumberOfEntries(),
originalColumnMetadata.getTotalNumberOfEntries());
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]