This is an automated email from the ASF dual-hosted git repository.

xiangfu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 5172ada0e4f segment-local: NoDict hybrid cardinality (exact ≤ 2,048 → 
HLL++ +10% cap) (#17186)
5172ada0e4f is described below

commit 5172ada0e4ff343dab14441a4aa3322989b2dd45
Author: Xiang Fu <[email protected]>
AuthorDate: Thu Nov 13 11:19:52 2025 -0800

    segment-local: NoDict hybrid cardinality (exact ≤ 2,048 → HLL++ +10% cap) 
(#17186)
    
    - Track exact uniques up to 2,048; return exact count while active
    - On threshold exceed, drop exact set and estimate via HLL++
    - Apply +10% upward buffer, clamped to total entries, to avoid 
underestimation
    - Reduce small-N flakiness; preserve safety at scale
    
    Tests
    - SegmentPreProcessorTest: expect exact small-N cardinality 5 for derived 
raw string
    - NoDictColumnStatisticsCollectorTest: assertion message refactor; 
tolerance unchanged
    
    Implementation notes
    - Add EXACT_UNIQUE_TRACKING_THRESHOLD and trackExactUnique()
    - Normalize keys for byte[] and BigDecimal during exact tracking
---
 .../stats/NoDictColumnStatisticsCollector.java     | 45 ++++++++++++++++++++--
 .../stats/NoDictColumnStatisticsCollectorTest.java |  8 ++--
 .../index/loader/SegmentPreProcessorTest.java      | 11 +++---
 3 files changed, 53 insertions(+), 11 deletions(-)

diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java
index 4962bd95db4..f047c56c479 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java
@@ -21,6 +21,8 @@ package 
org.apache.pinot.segment.local.segment.creator.impl.stats;
 import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
 import java.math.BigDecimal;
 import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Set;
 import org.apache.commons.lang3.NotImplementedException;
 import org.apache.pinot.segment.spi.creator.StatsCollectorConfig;
 import org.apache.pinot.spi.utils.BigDecimalUtils;
@@ -52,6 +54,10 @@ public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCol
   private boolean _sealed = false;
   // HLL Plus generally returns approximate cardinality >= actual cardinality 
which is desired
   private final HyperLogLogPlus _hllPlus;
+  // Track exact uniques up to a threshold to avoid small-N underestimation 
and test flakiness
+  private static final int EXACT_UNIQUE_TRACKING_THRESHOLD = 2048;
+  // Single-threaded: simple Set is sufficient; set to null once threshold 
exceeded to cap memory
+  private Set<Object> _exactUniques = new HashSet<>();
 
   public NoDictColumnStatisticsCollector(String column, StatsCollectorConfig 
statsCollectorConfig) {
     super(column, statsCollectorConfig);
@@ -75,6 +81,7 @@ public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCol
         }
         updateMinMax(value);
         updateHllPlus(value);
+        trackExactUnique(value);
         int len = getValueLength(value);
         _minLength = Math.min(_minLength, len);
         _maxLength = Math.max(_maxLength, len);
@@ -92,6 +99,7 @@ public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCol
         for (int value : values) {
           updateMinMax(value);
           updateHllPlus(value);
+          trackExactUnique(value);
         }
         length = values.length;
       } else if (entry instanceof long[]) {
@@ -99,6 +107,7 @@ public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCol
         for (long value : values) {
           updateMinMax(value);
           updateHllPlus(value);
+          trackExactUnique(value);
         }
         length = values.length;
       } else if (entry instanceof float[]) {
@@ -106,6 +115,7 @@ public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCol
         for (float value : values) {
           updateMinMax(value);
           updateHllPlus(value);
+          trackExactUnique(value);
         }
         length = values.length;
       } else {
@@ -113,6 +123,7 @@ public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCol
         for (double value : values) {
           updateMinMax(value);
           updateHllPlus(value);
+          trackExactUnique(value);
         }
         length = values.length;
       }
@@ -123,6 +134,7 @@ public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCol
       addressSorted(value);
       updateMinMax(entry);
       updateHllPlus(entry);
+      trackExactUnique(entry);
       int len = getValueLength(entry);
       _minLength = Math.min(_minLength, len);
       _maxLength = Math.max(_maxLength, len);
@@ -208,8 +220,11 @@ public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCol
 
   @Override
   public int getCardinality() {
-    // Get approximate distinct count estimate using HLL++
-    // Increase by 10% to increase probability of not returning lower than 
actual cardinality
+    // If we are still tracking exact uniques, return exact cardinality
+    if (_exactUniques != null) {
+      return _exactUniques.size();
+    }
+    // Get approximate distinct count estimate using HLL++ with a 10% upward 
buffer
     long estimate = Math.round(_hllPlus.cardinality() * 1.1);
     // There are cases where approximation can overshoot the actual number of 
entries.
     // Returning a cardinality greater than total entries can break 
assumptions.
@@ -223,10 +238,34 @@ public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCol
 
   private void updateHllPlus(Object value) {
     if (value instanceof BigDecimal) {
-      // Canonicalize BigDecimal as string to avoid scale-related equality 
issues
+      // Canonicalize BigDecimal as string to avoid scale-related equality 
issues:
+      // BigDecimals with different scales (e.g., 1.0 vs 1.00) are not equal 
by default,
+      // but their string representations normalize the value for cardinality 
tracking.
       _hllPlus.offer(((BigDecimal) value).toString());
     } else {
       _hllPlus.offer(value);
     }
   }
+
+  private void trackExactUnique(Object value) {
+    if (_exactUniques == null) {
+      return;
+    }
+    Object key;
+    if (value instanceof byte[]) {
+      key = new ByteArray((byte[]) value);
+    } else if (value instanceof BigDecimal) {
+      // Use string representation to avoid scale-related equality issues:
+      // BigDecimals with different scales (e.g., 1.0 vs 1.00) are not equal 
by default,
+      // but their string representations normalize the value for cardinality 
tracking.
+      key = ((BigDecimal) value).toString();
+    } else {
+      key = value;
+    }
+    _exactUniques.add(key);
+    if (_exactUniques.size() > EXACT_UNIQUE_TRACKING_THRESHOLD) {
+      // Drop exact tracking once the threshold is exceeded to avoid unbounded 
memory.
+      _exactUniques = null;
+    }
+  }
 }
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollectorTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollectorTest.java
index 3600f2f0562..546722c231f 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollectorTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollectorTest.java
@@ -781,9 +781,11 @@ public class NoDictColumnStatisticsCollectorTest {
     String context = String.format("Iteration %d, DataType: %s, SingleValue: 
%s, Data: %s",
         iteration, dataType, isSingleValue, Arrays.deepToString(testData));
 
-    assertTrue(noDictCollector.getCardinality() >= 
expectedCollector.getCardinality(),
-        "Approx Cardinality " + noDictCollector.getCardinality() + " is lower 
than actual cardinality "
-            + expectedCollector.getCardinality() + " for " + context);
+    int approx = noDictCollector.getCardinality();
+    int exact = expectedCollector.getCardinality();
+    assertTrue(approx >= exact,
+        "Approx Cardinality " + approx + " is lower than actual cardinality "
+            + exact + " for " + context);
     assertEquals(noDictCollector.getMinValue(), 
expectedCollector.getMinValue(),
         "MinValue mismatch - " + context);
     assertEquals(noDictCollector.getMaxValue(), 
expectedCollector.getMaxValue(),
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
index 0263369bd10..4807763233a 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
@@ -391,7 +391,7 @@ public class SegmentPreProcessorTest implements 
PinotBuffersAfterClassCheckRule
   public void testSimpleEnableDictionarySV()
       throws Exception {
     int approxCardinality = 46934; // derived via 
NoDictColumnStatisticsCollector
-    int approxCardinalityStr = 6; // derived via 
NoDictColumnStatisticsCollector
+    int approxCardinalityStr = 5; // derived via 
NoDictColumnStatisticsCollector
     // TEST 1. Check running forwardIndexHandler on a V1 segment. No-op for 
all existing raw columns.
     buildV1Segment();
     checkForwardIndexCreation(EXISTING_STRING_COL_RAW, approxCardinalityStr, 
3, _schema, false,
@@ -456,7 +456,7 @@ public class SegmentPreProcessorTest implements 
PinotBuffersAfterClassCheckRule
 
     // TEST 2: EXISTING_STRING_COL_RAW. Enable dictionary on a raw column that 
already has text index.
     resetIndexConfigs();
-    int approxCardinalityStr = 6; // derived via 
NoDictColumnStatisticsCollector
+    int approxCardinalityStr = 5; // derived via 
NoDictColumnStatisticsCollector
     _fieldConfigMap.put(EXISTING_STRING_COL_RAW,
         new FieldConfig(EXISTING_STRING_COL_RAW, FieldConfig.EncodingType.RAW, 
List.of(FieldConfig.IndexType.TEXT),
             null, null));
@@ -642,7 +642,7 @@ public class SegmentPreProcessorTest implements 
PinotBuffersAfterClassCheckRule
   public void testForwardIndexHandlerChangeCompression()
       throws Exception {
     int approximateCardinality = 20516; // derived via 
NoDictColumnStatisticsCollector
-    int approxCardinalityStr = 6; // derived via 
NoDictColumnStatisticsCollector
+    int approxCardinalityStr = 5; // derived via 
NoDictColumnStatisticsCollector
 
     // Test1: Rewriting forward index will be a no-op for v1 segments. Default 
LZ4 compressionType will be retained.
     buildV1Segment();
@@ -727,7 +727,7 @@ public class SegmentPreProcessorTest implements 
PinotBuffersAfterClassCheckRule
   @Test(dataProvider = "bothV1AndV3")
   public void testEnableTextIndexOnExistingRawColumn(SegmentVersion 
segmentVersion)
       throws Exception {
-    int approxCardinalityStr = 6; // derived via 
NoDictColumnStatisticsCollector
+    int approxCardinalityStr = 5; // derived via 
NoDictColumnStatisticsCollector
     buildSegment(segmentVersion);
     _fieldConfigMap.put(EXISTING_STRING_COL_RAW,
         new FieldConfig(EXISTING_STRING_COL_RAW, FieldConfig.EncodingType.RAW, 
List.of(FieldConfig.IndexType.TEXT),
@@ -1130,7 +1130,8 @@ public class SegmentPreProcessorTest implements 
PinotBuffersAfterClassCheckRule
     assertEquals(columnMetadata.getFieldSpec(), spec);
     assertTrue(columnMetadata.isAutoGenerated());
     originalColumnMetadata = segmentMetadata.getColumnMetadataFor("column3");
-    assertEquals(columnMetadata.getCardinality(), 6); // cardinality derived 
via NoDictColumnStatisticsCollector
+    // exact cardinality for small-N via NoDictColumnStatisticsCollector
+    assertEquals(columnMetadata.getCardinality(), 5);
     assertEquals(columnMetadata.getBitsPerElement(), 
originalColumnMetadata.getBitsPerElement());
     assertEquals(columnMetadata.getTotalNumberOfEntries(), 
originalColumnMetadata.getTotalNumberOfEntries());
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to