Copilot commented on code in PR #16845:
URL: https://github.com/apache/pinot/pull/16845#discussion_r2373423934


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java:
##########
@@ -0,0 +1,194 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.creator.impl.stats;
+
+import java.math.BigDecimal;
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.pinot.segment.spi.creator.StatsCollectorConfig;
+import org.apache.pinot.spi.utils.BigDecimalUtils;
+import org.apache.pinot.spi.utils.ByteArray;
+
+
+/**
+ * Column statistics collector for no-dictionary columns that avoids storing 
unique values and thus reduces memory
+ * Behavior:
+ * - getUniqueValuesSet() throws NotImplementedException
+ * - getCardinality() returns total entries collected
+ */
+@SuppressWarnings({"rawtypes", "unchecked"})
+public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCollector {
+  private Comparable _minValue;
+  private Comparable _maxValue;
+  private int _minLength = Integer.MAX_VALUE;
+  private int _maxLength = -1; // default return value is -1
+  private int _maxRowLength = -1; // default return value is -1
+  private boolean _sealed = false;
+
+  public NoDictColumnStatisticsCollector(String column, StatsCollectorConfig 
statsCollectorConfig) {
+    super(column, statsCollectorConfig);
+  }
+
+  @Override
+  public void collect(Object entry) {
+    assert !_sealed;
+    if (entry instanceof Object[]) {
+      Object[] values = (Object[]) entry;
+      int rowLength = 0;
+      for (Object value : values) {
+        if (value instanceof BigDecimal) {
+          // BigDecimalColumnPreIndexStatsCollector doesn't support multi-value
+          throw new UnsupportedOperationException();
+        }
+        updateMinMax(value);
+        int len = getValueLength(value);
+        _minLength = Math.min(_minLength, len);
+        _maxLength = Math.max(_maxLength, len);
+        rowLength += len;
+      }
+      _maxNumberOfMultiValues = Math.max(_maxNumberOfMultiValues, 
values.length);
+      _maxRowLength = Math.max(_maxRowLength, rowLength);
+      updateTotalNumberOfEntries(values);
+    } else if (entry instanceof int[] || entry instanceof long[]
+        || entry instanceof float[] || entry instanceof double[]) {
+      // Native multi-value types don't require length calculation
+      int length;
+      if (entry instanceof int[]) {
+        int[] values = (int[]) entry;
+        for (int value : values) {
+          updateMinMax(value);
+        }
+        length = values.length;
+      } else if (entry instanceof long[]) {
+        long[] values = (long[]) entry;
+        for (long value : values) {
+          updateMinMax(value);
+        }
+        length = values.length;
+      } else if (entry instanceof float[]) {
+        float[] values = (float[]) entry;
+        for (float value : values) {
+          updateMinMax(value);
+        }
+        length = values.length;
+      } else {
+        double[] values = (double[]) entry;
+        for (double value : values) {
+          updateMinMax(value);
+        }
+        length = values.length;
+      }
+      _maxNumberOfMultiValues = Math.max(_maxNumberOfMultiValues, length);
+      updateTotalNumberOfEntries(length);
+    } else {
+      addressSorted(toComparable(entry));
+      updateMinMax(entry);
+      int len = getValueLength(entry);
+      _minLength = Math.min(_minLength, len);
+      _maxLength = Math.max(_maxLength, len);
+      if (isPartitionEnabled()) {
+        updatePartition(entry.toString());
+      }
+      _maxRowLength = Math.max(_maxRowLength, len);
+      _totalNumberOfEntries++;
+    }
+  }
+
+  private void updateMinMax(Object value) {
+    Comparable comp = toComparable(value);
+    if (_minValue == null || comp.compareTo(_minValue) < 0) {
+      _minValue = comp;
+    }
+    if (_maxValue == null || comp.compareTo(_maxValue) > 0) {
+      _maxValue = comp;
+    }
+  }
+
+  private Comparable toComparable(Object value) {
+    if (value instanceof byte[]) {
+      return new ByteArray((byte[]) value);
+    }
+    if (value instanceof Comparable) {
+      return (Comparable) value;
+    }
+    throw new IllegalStateException("Unsupported value type " + 
value.getClass());
+  }
+
+  private int getValueLength(Object value) {
+    if (value instanceof byte[]) {
+      return ((byte[]) value).length;
+    }
+    if (value instanceof CharSequence) {
+      return ((CharSequence) 
value).toString().getBytes(StandardCharsets.UTF_8).length;
+    }
+    if (value instanceof BigDecimal) {
+      return BigDecimalUtils.byteSize((BigDecimal) value);
+    }
+    if (value instanceof Number) {
+      return 8; // fixed-width approximation; not used for fixed types' length 
assertions generally

Review Comment:
   This comment is unclear about when this approximation is appropriate and 
what the implications are. Consider providing more specific documentation about 
which numeric types this applies to and why 8 bytes is the chosen approximation.
   ```suggestion
         // For all Java Number types (e.g., Integer, Long, Float, Double), we 
approximate the length as 8 bytes,
         // which is the size of a Java double or long (the largest primitive 
numeric types).
         // This is a conservative upper bound and may not reflect the actual 
in-memory or serialized size for all Number types.
         // This approximation is only used as a fallback and is not used for 
length assertions for fixed-width types.
         return 8;
   ```



##########
pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/ColumnIndexCreationInfo.java:
##########
@@ -64,11 +65,23 @@ public Object getMax() {
   }
 
   public Object getSortedUniqueElementsArray() {
-    return _columnStatistics.getUniqueValuesSet();
+    try {
+      return _columnStatistics.getUniqueValuesSet();
+    } catch (NotImplementedException e) {
+      return null;
+    }

Review Comment:
   [nitpick] This try-catch approach for control flow is not ideal. Consider 
using a flag or method to check if unique values are supported before calling 
getUniqueValuesSet(), rather than relying on exception handling for normal 
operation.



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/NoDictColumnStatisticsCollector.java:
##########
@@ -0,0 +1,194 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.creator.impl.stats;
+
+import java.math.BigDecimal;
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.pinot.segment.spi.creator.StatsCollectorConfig;
+import org.apache.pinot.spi.utils.BigDecimalUtils;
+import org.apache.pinot.spi.utils.ByteArray;
+
+
+/**
+ * Column statistics collector for no-dictionary columns that avoids storing 
unique values and thus reduces memory
+ * Behavior:
+ * - getUniqueValuesSet() throws NotImplementedException
+ * - getCardinality() returns total entries collected
+ */
+@SuppressWarnings({"rawtypes", "unchecked"})
+public class NoDictColumnStatisticsCollector extends 
AbstractColumnStatisticsCollector {
+  private Comparable _minValue;
+  private Comparable _maxValue;
+  private int _minLength = Integer.MAX_VALUE;
+  private int _maxLength = -1; // default return value is -1
+  private int _maxRowLength = -1; // default return value is -1
+  private boolean _sealed = false;
+
+  public NoDictColumnStatisticsCollector(String column, StatsCollectorConfig 
statsCollectorConfig) {
+    super(column, statsCollectorConfig);
+  }
+
+  @Override
+  public void collect(Object entry) {
+    assert !_sealed;
+    if (entry instanceof Object[]) {
+      Object[] values = (Object[]) entry;
+      int rowLength = 0;
+      for (Object value : values) {
+        if (value instanceof BigDecimal) {
+          // BigDecimalColumnPreIndexStatsCollector doesn't support multi-value
+          throw new UnsupportedOperationException();

Review Comment:
   The error message should be more descriptive. Consider providing details 
about why BigDecimal multi-value is not supported and what alternatives are 
available.
   ```suggestion
             throw new UnsupportedOperationException(
                 "Multi-value columns of type BigDecimal are not supported. " +
                 "BigDecimalColumnPreIndexStatsCollector only supports 
single-value columns. " +
                 "Consider using a single-value column or a supported data type 
for multi-value columns.");
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to