Copilot commented on code in PR #18496:
URL: https://github.com/apache/pinot/pull/18496#discussion_r3240826965


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java:
##########
@@ -1194,56 +1167,67 @@ private void 
rewriteDictToRawForwardIndex(ColumnMetadata columnMetadata, Segment
       File indexDir)
       throws Exception {
     String column = columnMetadata.getColumnName();
-    // Get the forward index reader factory and create a reader
-    IndexReaderFactory<ForwardIndexReader> readerFactory = 
StandardIndexes.forward().getReaderFactory();
-    try (ForwardIndexReader<?> reader = 
readerFactory.createIndexReader(segmentWriter, _fieldIndexConfigs.get(column),
-        columnMetadata)) {
-      Dictionary dictionary = DictionaryIndexType.read(segmentWriter, 
columnMetadata);
-      IndexCreationContext.Builder builder =
-          new IndexCreationContext.Builder(indexDir, _tableConfig, 
columnMetadata).withDictionary(false);
-      // Encoding flows through ForwardIndexConfig set below. The row length 
is derived from persisted metadata via
-      // the Builder constructor; only fall back to scanning when metadata 
returns UNAVAILABLE (var-width MV columns
-      // with varying element lengths).
-      if (columnMetadata.getMaxRowLengthInBytes() == 
ColumnMetadata.UNAVAILABLE) {
-        builder.withMaxRowLengthInBytes(getMaxRowLength(columnMetadata, 
reader, dictionary));
-      }
-      ForwardIndexConfig config = 
_fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward());
-      try (ForwardIndexCreator creator = 
StandardIndexes.forward().createIndexCreator(builder.build(), config)) {
-        forwardIndexRewriteHelper(column, columnMetadata, reader, creator, 
columnMetadata.getTotalDocs(), null,
+    FieldIndexConfigs indexConfigs = _fieldIndexConfigs.get(column);
+    try (ForwardIndexReader<?> forwardIndex = 
StandardIndexes.forward().getReaderFactory()
+        .createIndexReader(segmentWriter, indexConfigs, columnMetadata);
+        Dictionary dictionary = DictionaryIndexType.read(segmentWriter, 
columnMetadata)) {
+      IndexCreationContext context = new 
IndexCreationContext.Builder(indexDir, _tableConfig, columnMetadata).build();
+      ForwardIndexConfig config = 
indexConfigs.getConfig(StandardIndexes.forward());
+      try (ForwardIndexCreator creator = 
StandardIndexes.forward().createIndexCreator(context, config)) {
+        forwardIndexRewriteHelper(column, columnMetadata, forwardIndex, 
creator, columnMetadata.getTotalDocs(), null,
             dictionary);
       }
     }
   }
 
-  /**
-   * Returns the max row length for a column.
-   * - For SV column, this is the length of the longest value.
-   * - For MV column, this is the length of the longest MV entry (sum of 
lengths of all elements).
-   */
-  private int getMaxRowLength(ColumnMetadata columnMetadata, 
ForwardIndexReader<?> forwardIndex,
-      @Nullable Dictionary dictionary)
-      throws IOException {
-    String column = columnMetadata.getColumnName();
+  /// Scans the column and persists the 1.6.0-era length stats when they are 
missing from segment metadata. After
+  /// this returns, per-op handlers can read those stats off [ColumnMetadata] 
when they construct their forward-index
+  /// creator. No-op when:
+  /// - the column is fixed-width (length stats derive from 
`storedType.size()`);
+  /// - the metadata already carries the stats (`getLengthOfShortestElement() 
>= 0` — proxy for "1.6.0 stats
+  ///   present" since the four keys were introduced together);
+  /// - the column has no forward index on disk (those are rebuilt via
+  ///   [InvertedIndexAndDictionaryBasedForwardIndexCreator], which handles 
its own backfill).
+  private void backfillMissingLengthStatsForColumn(String column, 
SegmentDirectory.Writer segmentWriter)
+      throws Exception {
+    ColumnMetadata columnMetadata = 
_segmentDirectory.getSegmentMetadata().getColumnMetadataFor(column);
     DataType storedType = columnMetadata.getDataType().getStoredType();
-    assert !storedType.isFixedWidth();
-
-    try (PinotSegmentColumnReader columnReader = new 
PinotSegmentColumnReader(forwardIndex, dictionary, null,
-        columnMetadata.getMaxNumberOfMultiValues())) {
-      AbstractColumnStatisticsCollector statsCollector = 
getStatsCollector(column, storedType);
-      int numDocs = columnMetadata.getTotalDocs();
-      for (int i = 0; i < numDocs; i++) {
-        statsCollector.collect(columnReader.getValue(i));
+    if (storedType.isFixedWidth() || 
columnMetadata.getLengthOfShortestElement() >= 0) {
+      return;
+    }
+    try (ForwardIndexReader<?> forwardIndex = 
StandardIndexes.forward().getReaderFactory()
+        .createIndexReader(segmentWriter, _fieldIndexConfigs.get(column), 
columnMetadata)) {
+      if (forwardIndex == null) {
+        return;
+      }
+      boolean dictionaryEncoded = forwardIndex.isDictionaryEncoded();
+      try (Dictionary dictionary = dictionaryEncoded ? 
DictionaryIndexType.read(segmentWriter, columnMetadata) : null;
+          PinotSegmentColumnReader columnReader = new 
PinotSegmentColumnReader(forwardIndex, dictionary, null,
+              columnMetadata.getMaxNumberOfMultiValues())) {
+        AbstractColumnStatisticsCollector statsCollector = 
getStatsCollector(column, storedType, false);
+        int numDocs = columnMetadata.getTotalDocs();
+        for (int i = 0; i < numDocs; i++) {
+          statsCollector.collect(columnReader.getValue(i));
+        }
+        Map<String, String> metadataProperties = new HashMap<>();
+        metadataProperties.put(getKeyFor(column, FORWARD_INDEX_ENCODING),
+            dictionaryEncoded ? EncodingType.DICTIONARY.name() : 
EncodingType.RAW.name());
+        metadataProperties.put(getKeyFor(column, LENGTH_OF_SHORTEST_ELEMENT),
+            String.valueOf(statsCollector.getLengthOfShortestElement()));

Review Comment:
   For an empty STRING column this backfill can persist `Integer.MAX_VALUE` as 
`LENGTH_OF_SHORTEST_ELEMENT`: `StringColumnPreIndexStatsCollector` initializes 
the minimum length to `Integer.MAX_VALUE` and returns it unchanged when no rows 
are collected, while empty segments are supported elsewhere in the stats path. 
Please normalize the no-value case to 0 before writing metadata so subsequent 
reloads do not see an invalid shortest length.
   



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java:
##########
@@ -1194,56 +1167,67 @@ private void 
rewriteDictToRawForwardIndex(ColumnMetadata columnMetadata, Segment
       File indexDir)
       throws Exception {
     String column = columnMetadata.getColumnName();
-    // Get the forward index reader factory and create a reader
-    IndexReaderFactory<ForwardIndexReader> readerFactory = 
StandardIndexes.forward().getReaderFactory();
-    try (ForwardIndexReader<?> reader = 
readerFactory.createIndexReader(segmentWriter, _fieldIndexConfigs.get(column),
-        columnMetadata)) {
-      Dictionary dictionary = DictionaryIndexType.read(segmentWriter, 
columnMetadata);
-      IndexCreationContext.Builder builder =
-          new IndexCreationContext.Builder(indexDir, _tableConfig, 
columnMetadata).withDictionary(false);
-      // Encoding flows through ForwardIndexConfig set below. The row length 
is derived from persisted metadata via
-      // the Builder constructor; only fall back to scanning when metadata 
returns UNAVAILABLE (var-width MV columns
-      // with varying element lengths).
-      if (columnMetadata.getMaxRowLengthInBytes() == 
ColumnMetadata.UNAVAILABLE) {
-        builder.withMaxRowLengthInBytes(getMaxRowLength(columnMetadata, 
reader, dictionary));
-      }
-      ForwardIndexConfig config = 
_fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward());
-      try (ForwardIndexCreator creator = 
StandardIndexes.forward().createIndexCreator(builder.build(), config)) {
-        forwardIndexRewriteHelper(column, columnMetadata, reader, creator, 
columnMetadata.getTotalDocs(), null,
+    FieldIndexConfigs indexConfigs = _fieldIndexConfigs.get(column);
+    try (ForwardIndexReader<?> forwardIndex = 
StandardIndexes.forward().getReaderFactory()
+        .createIndexReader(segmentWriter, indexConfigs, columnMetadata);
+        Dictionary dictionary = DictionaryIndexType.read(segmentWriter, 
columnMetadata)) {
+      IndexCreationContext context = new 
IndexCreationContext.Builder(indexDir, _tableConfig, columnMetadata).build();
+      ForwardIndexConfig config = 
indexConfigs.getConfig(StandardIndexes.forward());
+      try (ForwardIndexCreator creator = 
StandardIndexes.forward().createIndexCreator(context, config)) {
+        forwardIndexRewriteHelper(column, columnMetadata, forwardIndex, 
creator, columnMetadata.getTotalDocs(), null,
             dictionary);
       }
     }
   }
 
-  /**
-   * Returns the max row length for a column.
-   * - For SV column, this is the length of the longest value.
-   * - For MV column, this is the length of the longest MV entry (sum of 
lengths of all elements).
-   */
-  private int getMaxRowLength(ColumnMetadata columnMetadata, 
ForwardIndexReader<?> forwardIndex,
-      @Nullable Dictionary dictionary)
-      throws IOException {
-    String column = columnMetadata.getColumnName();
+  /// Scans the column and persists the 1.6.0-era length stats when they are 
missing from segment metadata. After
+  /// this returns, per-op handlers can read those stats off [ColumnMetadata] 
when they construct their forward-index
+  /// creator. No-op when:
+  /// - the column is fixed-width (length stats derive from 
`storedType.size()`);
+  /// - the metadata already carries the stats (`getLengthOfShortestElement() 
>= 0` — proxy for "1.6.0 stats
+  ///   present" since the four keys were introduced together);
+  /// - the column has no forward index on disk (those are rebuilt via
+  ///   [InvertedIndexAndDictionaryBasedForwardIndexCreator], which handles 
its own backfill).
+  private void backfillMissingLengthStatsForColumn(String column, 
SegmentDirectory.Writer segmentWriter)
+      throws Exception {
+    ColumnMetadata columnMetadata = 
_segmentDirectory.getSegmentMetadata().getColumnMetadataFor(column);
     DataType storedType = columnMetadata.getDataType().getStoredType();
-    assert !storedType.isFixedWidth();
-
-    try (PinotSegmentColumnReader columnReader = new 
PinotSegmentColumnReader(forwardIndex, dictionary, null,
-        columnMetadata.getMaxNumberOfMultiValues())) {
-      AbstractColumnStatisticsCollector statsCollector = 
getStatsCollector(column, storedType);
-      int numDocs = columnMetadata.getTotalDocs();
-      for (int i = 0; i < numDocs; i++) {
-        statsCollector.collect(columnReader.getValue(i));
+    if (storedType.isFixedWidth() || 
columnMetadata.getLengthOfShortestElement() >= 0) {
+      return;
+    }
+    try (ForwardIndexReader<?> forwardIndex = 
StandardIndexes.forward().getReaderFactory()

Review Comment:
   This new pre-pass is the behavior that guarantees compression rewrites and 
dict-to-raw rewrites can rely on `ColumnMetadata`, but the existing 
`ForwardIndexHandlerTest` coverage does not exercise backfilling missing 
`LENGTH_OF_SHORTEST_ELEMENT` / `LENGTH_OF_LONGEST_ELEMENT` / 
`MAX_ROW_LENGTH_IN_BYTES` / `IS_ASCII` metadata. Please add a reload/rewrite 
test with pre-1.6.0-style metadata so regressions here are caught.



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/InvertedIndexAndDictionaryBasedForwardIndexCreator.java:
##########
@@ -255,33 +257,64 @@ private Map<String, String> 
createForwardIndexForSVColumn()
         (BitmapInvertedIndexReader) InvertedIndexType.ReaderFactory
             .INSTANCE.createSkippingForward(_segmentWriter, _columnMetadata);
         Dictionary dictionary = DictionaryIndexType.read(_segmentWriter, 
_columnMetadata)) {
-      // Construct the forward index in the values buffer
+      // Construct the forward index in the values buffer. For var-length 
columns, also gather per-element stats
+      // (lengthOfShortest/Longest, isAscii for STRING) inline when the source 
segment is missing them, so the
+      // backfill happens without a second dictionary scan.
+      DataType storedType = _columnMetadata.getStoredType();
+      boolean backfillStats =
+          !storedType.isFixedWidth() && 
_columnMetadata.getLengthOfShortestElement() < 0;

Review Comment:
   The inline dictionary-scan backfill path is new production behavior, but 
there is no test coverage for regenerating a forward index from inverted index 
+ dictionary when the source metadata lacks the new length stats. Please add 
coverage for SV/MV variable-length columns (including `MAX_ROW_LENGTH_IN_BYTES` 
for MV) so this persisted metadata is verified.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to